# -*- coding: utf-8 -*-
"""
PROJECT: ANOMALIES AS NEW HEDGE FUND FACTORS
    CODE CATEGORY: MASTER OUTPUT
        CODE NAME: MASTER_CODE_TABLES_FIGURES.py
    
    This is the python code that generates the main results and figures in "Anomalies
    as New Hedge Fund Factors" by Yong Chen, Sophia Li, Yushan Tang, and Guofu Zhou. 
    
    It creates the following tables and figures: 
        - Figure 1: Variance explained ratios across principal components
        - Figure 2: Distribution of hedge fund alphas
        - Figure 3: Distribution of hedge fund alphas: BarclayHedge
        - Figure 4: Distributions of bootstrap t-statistics
        
        - Table 1: Descriptive statistics for hedge fund returns
        - Table 2: Descriptive statistics for factors
        - Table 3: Factor selection via adaptive LASSO
        - Table 4: Nested model selection
        - Table 5: Model comparison
        - Table 6: Model comparison: Fund alpha distribution
        - Table 7: Model comparison: BarclayHedge
        - Table 8: Model comparison: Out-of-sample period
        - Table 9: Model comparison: Arbitrage trading
        - Table 10: Model comparison: Alternative models
        - Table 11: Flow-performance relation
        - Table 12: Fund failure likelihood
        - Table 13: Bootstrap analysis of fund alphas
        - Table 14: Fund alpha and characteristics
        
    The code is tested under Python 3.9.19, with packages and corresponding versions:
        - numpy 1.24.3
        - pandas 1.5.3 
        - sklearn 1.0.1
        - statsmodels 0.13.5
        - scipy 1.10.1
        - matplotlib 3.7.1 
        - seaborn 0.12.2
        - linearmodels 6.1
        - lifelines 0.27.4
        
        
    NOVEMBER 2022
    
    Version: January 18, 2025
    
"""






#########################
# PACKAGES AND VERSIONS #
#########################


# IMPORT PACKAGES #
import os
import numpy as np
if np.__version__ != '1.24.3':
    raise ImportError("package version of 'numpy' is not 1.24.3")

import pandas as pd
if pd.__version__ != '1.5.3':
    raise ImportError("package version of 'pandas' is not 1.5.3")
    
import sklearn
if sklearn.__version__ != '1.0.1':
    raise ImportError("package version of 'scikit-learn' is not 1.0.1")
from sklearn.metrics import r2_score
from sklearn import linear_model

import statsmodels
if statsmodels.__version__ != '0.13.5':
    raise ImportError("package version of 'statsmodels' is not 0.13.5")
import statsmodels.api as sm

import scipy
if scipy.__version__ != '1.10.1':
    raise ImportError("package version of 'scipy' is not 1.10.1")
from scipy import stats

import matplotlib
if matplotlib.__version__ != '3.7.1':
    raise ImportError("package version of 'matplotlib' is not 3.7.1")

import seaborn as sns
if sns.__version__ != '0.12.2':
    raise ImportError("package version of 'seaborn' is not 0.12.2")

import linearmodels
if linearmodels.__version__ != '6.1':
    raise ImportError("package version of 'linearmodels' is not 6.1")

import lifelines
if lifelines.__version__ != '0.27.4':
    raise ImportError("package version of 'lifelines' is not 0.27.4")






############################
# WORKING ENVIROMENT SETUP #
############################


# SET UP INPUT DATA DIRECTORY #
#dataDir = 'C:\\Users\\admin\\Dropbox\\Yong_Yushan_Sophia\\Replication_Package_JFQA-24-24882\\Data'
dataDir = 'C:\\Users\\admin\\Dropbox\\Yong_Yushan_Sophia\\Replication_Package_JFQA-24-24882\\PseudoData'
os.chdir(dataDir)


# SET UP TABLE AND FIGURE DIRECTORY #
figDir = 'C:\\Users\\admin\\Dropbox\\Yong_Yushan_Sophia\\Replication_Package_JFQA-24-24882\\Figure'
if not os.path.exists(figDir):
    os.makedirs(figDir)
tabDir = 'C:\\Users\\admin\\Dropbox\\Yong_Yushan_Sophia\\Replication_Package_JFQA-24-24882\\Table'
if not os.path.exists(tabDir):
    os.makedirs(tabDir)






############################
#         TABLE 1          # 
############################


# IMPORT HF DATA PICKLE #
#hfChar_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfChar_df = pd.read_pickle('PSEUDODATA1.pkl')


# IMPORT FACTORS # 
factor_df = pd.read_excel('MONTH_FACTOR.xlsx')
hfChar_df = pd.merge(left=hfChar_df, 
                     right=factor_df.loc[:, ['MONTH','RF']], 
                     how='left', on=['MONTH'])
factor_df.set_index(keys=['MONTH'], inplace=True)
factor_df.drop(columns=['RF'], inplace=True)


# HF NET-OF-FEE RETURNS #
hfChar_df['RET'] = hfChar_df['EXRET'] + hfChar_df['RF']
hfChar_df.drop(columns=['RF'], inplace=True)


# SUMMARY STATISTICS FOR HEDGE FUND UNIVERSE BY CATEGORY #
# NUMBER OF OBSERVATIONS, MEAN, MEDIAN, SD, AND PERCENTILES #
sumStat_df = pd.concat([hfChar_df['CAT'].value_counts(), 
                hfChar_df.groupby(by=['CAT'])['RET'].mean(), 
                hfChar_df.groupby(by=['CAT'])['RET'].std(), 
                hfChar_df.groupby(by=['CAT'])['RET'].quantile(q=0.1),
                hfChar_df.groupby(by=['CAT'])['RET'].quantile(q=0.25),
                hfChar_df.groupby(by=['CAT'])['RET'].median(),
                hfChar_df.groupby(by=['CAT'])['RET'].quantile(q=0.75),
                hfChar_df.groupby(by=['CAT'])['RET'].quantile(q=0.9)], axis=1)
sumStat_df.columns = ['N', 'Mean', 'SD', '10%', '25%', 'Median', '75%', '90%']


# SUMMARY STATISTICS FOR WHOLE HEDGE FUND UNIVERSE # 
# NUMBER OF OBSERVATIONS, MEAN, MEDIAN, SD, AND PERCENTILES #
sumStat_df = pd.concat([sumStat_df, 
                pd.DataFrame({'N': hfChar_df.shape[0], 'Mean': hfChar_df['RET'].mean(), 
                              'SD': hfChar_df['RET'].std(), '10%': hfChar_df['RET'].quantile(q=0.1),
                              '25%': hfChar_df['RET'].quantile(q=0.25), 'Median': hfChar_df['RET'].median(), 
                              '75%': hfChar_df['RET'].quantile(q=0.75), '90%': hfChar_df['RET'].quantile(q=0.9)}, 
                             index=['All Categories'])], axis=0)


# EQUAL-WEIGHTED PORTFOLIOS BASED ON CATEGORY #
catExRet_df = pd.concat([hfChar_df.groupby(by=['CAT','MONTH'])['RET'].mean(), 
                         hfChar_df.groupby(by=['CAT','MONTH'])['RET'].count()], 
                        axis=1)
catExRet_df.columns = ['RET','NUMFUNDS']
catExRet_df = catExRet_df.reset_index()
# 2,697 category-month observations; 10 unique categories
sumStat_df['EW Mean'] = catExRet_df.groupby(by='CAT')['RET'].mean()
sumStat_df['EW SD'] = catExRet_df.groupby(by='CAT')['RET'].std()
del catExRet_df


# VALUE-WEIGHTED PORTFOLIOS BASED ON CATEGORY #
# IF MISSING AUM, WEIGHTS SET AS ZERO TO EXCLUDE RETURNS #
hfChar_df['AUM'].fillna(0, inplace=True)


# CONSTRUCT CATEGORY HEDGE FUND EXCESS RETURNS #
catExRet_df = pd.concat([hfChar_df.groupby(by=['CAT','MONTH']).apply(lambda x: np.average(x['RET'], weights=x['AUM'])), 
                         hfChar_df.groupby(by=['CAT','MONTH'])['AUM'].count()], 
                        axis=1)
catExRet_df.columns = ['RET','NUMFUNDS']
catExRet_df = catExRet_df.reset_index()
# 2,697 category-month observations; 10 unique categories
sumStat_df['VW Mean'] = catExRet_df.groupby(by='CAT')['RET'].mean()
sumStat_df['VW SD'] = catExRet_df.groupby(by='CAT')['RET'].std()
del catExRet_df


# EQUAL-WEIGHTED INDUSTRY #
indExRet = pd.concat([hfChar_df.groupby(by=['MONTH'])['RET'].mean(), 
                      hfChar_df.groupby(by=['MONTH']).apply(lambda x: np.average(x['RET'], weights=x['AUM']))], axis=1)
indExRet.columns = ['RET_EW','RET_VW']
indExRet = indExRet.reset_index()
sumStat_df.loc['All Categories', 'EW Mean'] = indExRet['RET_EW'].mean()
sumStat_df.loc['All Categories', 'EW SD'] = indExRet['RET_EW'].std()
sumStat_df.loc['All Categories', 'VW Mean'] = indExRet['RET_VW'].mean()
sumStat_df.loc['All Categories', 'VW SD'] = indExRet['RET_VW'].std()
del indExRet


# RE-ORDER INDEX #
sumStat_df.sort_index(inplace=True)
sumStat_df = pd.concat([sumStat_df.iloc[1:,], 
                        sumStat_df.iloc[[0],:]])


# OUTPUT RESULTS #
sumStat_df.to_csv(tabDir + '\Table1.csv')
del sumStat_df


# HOUSE CLEAN #
del hfChar_df






############################
#         TABLE 2          # 
############################


# GDP IN $1,000 #
factor_df['GDP'] = factor_df['GDP'] / 1000


# SUMMARY STATISTICS #
sumStat_df = pd.concat([factor_df.count(), factor_df.mean(), 
                        factor_df.std(), factor_df.quantile(q=0.1), 
                        factor_df.quantile(q=0.25), factor_df.median(), 
                        factor_df.quantile(q=0.75), factor_df.quantile(q=0.9)], axis=1)
sumStat_df.columns = ['N', 'Mean', 'SD', '10%', '25%', 'Median', '75%', '90%']


# OUTPUT RESULTS #
sumStat_df.to_csv(tabDir + '\\Table2.csv')
del factor_df, sumStat_df






############################
#         TABLE 3          # 
############################


    ############################
    # FUNCTION AND PREPARATION #
    ############################


# FUNCTION FOR ADAPTIVE LASSO FOR GIVEN DATA #
def aLASSO(data, xNameList, yName, criterion='aic'):
    '''
    This function takes given data as fitting set, performs Adaptive LASSO with 
    weights of OLS estimates, and finaly return fitted results. 

    Parameters
    ----------
    data : pd.DataFrame
        Pandas data frame conraining fitting sample.
    xNameList : list
        List of string representations of variables.
    yName : str
        String representation of label.
    criterion : str, optional
        Criterion for determining penalty parameter in Adaptive LASSO. 
        The default is 'aic'.

    Returns
    -------
    alpha : float
        Tuned penalty parameter fir Adaptive LASSO.
    coefs : pd.DataFrame
        Series of estimated coefficients.
    numFeatures : int
        Number of variables selected.

    '''
    
    
    # STANBDARDIZE X VARIABLES #
    fitData = data.copy()
    for xName in xNameList:
        fitData[xName] = ( fitData[xName].copy() - fitData[xName].mean() ) / fitData[xName].std()
        del xName
    
    
    # OLS COEFFICIENTS AS WEIGHTS #
    ols = linear_model.LinearRegression(fit_intercept=True, n_jobs=-1).fit(X=fitData.loc[:, xNameList].copy(), 
                                                                           y=fitData[yName].copy() )
    gprime = lambda w: 1. / (np.abs(w) + np.finfo(float).eps)
    weights = gprime(ols.coef_)
    
    
    # ADJUST X WITH WEIGHTS #
    XWeighted = fitData.loc[:, xNameList].copy()/weights


    # CREATE LASSO OBJECT #
    lasso = linear_model.LassoLarsIC(criterion=criterion, fit_intercept=True, 
                                     normalize=False, max_iter=1000000)
    
    
    # SUPPLY DATA AND FIT #
    lasso.fit(X=XWeighted, y=fitData[yName].copy() )
    
    
    # BEST ALPHA #
    alpha = lasso.alpha_ 
    
    
    # GET COEFFICIENTS AND NUMBER OF VARIABLES #
    coefs = pd.DataFrame(np.insert(lasso.coef_/weights, 0, lasso.intercept_), 
                         columns=['coef'], index=['const']+xNameList)
    coefs.replace(0, np.NaN, inplace=True)
    numFeatures = coefs.iloc[1:].dropna().shape[0]
    
    
    return alpha, coefs, numFeatures


# IMPORT FACTORS # 
factor_df = pd.read_excel('MONTH_FACTOR.xlsx')
factor_df.drop(columns=['RF'], inplace=True)


# DEFINE FACTOR LIST AND LABEL #
XNAMELIST = [# ANOMALY FACTORS #
             'ACC', 'AGR', 'BAB', 'CARRY', 'CEI', 'DTI', 'GPF', 
             'ILL', 'INVEST', 'LLEV', 'LRSK', 'MKT', 'MOM', 'NOA',
             'NSI', 'OSC', 'PFT', 'PGR', 'QLT', 'ROA', 'SIZE', 
             'SKW', 'SSN', 'TSMOM', 'VALUE', 
             # TREND-FOLLOWING FACTORS #
             'BDTF', 'CMTF', 'FXTF', 'IRTF', 'SKTF', 
             # MACROECONOMIC FACTORS # 
             '10Y', 'BM', 'CRDT', 'DFLT', 'DPR', 'EPR', 'GDP', 
             'INFL', 'INPR', 'NTIS', 'SVAR', 'TBL', 'TRM', 'UNC']
YNAME = 'EXRET'


# RECEIVING DATA FRAME #
coefsSelect_df = pd.DataFrame(columns=['Fund', 'EW Category', 'VW Category',
                                       'EW Industry', 'VW Industry'], 
                              index=['const'] + XNAMELIST)
sumSelect_df = pd.DataFrame(columns=['Fund', 'EW Category', 'VW Category',
                                     'EW Industry', 'VW Industry'], 
                            index=['Number of Time Series', 
                                   'Average Number of Factors'])


    ###############################
    # FUND-LEVEL FACTOR SELECTION #
    ###############################


# HEDGE FUND EXCESS RETURNS #
#hfExRet_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfExRet_df = pd.read_pickle('PSEUDODATA1.pkl')
hfExRet_df = hfExRet_df.loc[:,['FUNDID','MONTH','EXRET','CAT']].copy()
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 713,737 fund-month observations; 7,314 unique funds


# GET FUND LIST #
fundList = list(hfExRet_df['FUNDID'].unique())
fundList.sort()
hfExRet_df.drop(columns=['CAT'], inplace=True)


# RECEIVING LIST FOR DATA FRAMES #
coefsDf_list = list()
sum_df = pd.DataFrame(columns=['Alpha', 'numFeatures', 
                               'numObs', 'startMonth', 'endMonth'], 
                      index=fundList)


# LOOP THROUGH HEDGE FUNDS #
for fundID in fundList:
    
    
    # DISPLAY STATUS TO USER
    print(f'Hedge Fund: {fundID}', end=', ')
    
    
    # GET FITTING DATA #
    data_fund = pd.merge(left=hfExRet_df.loc[hfExRet_df['FUNDID']==fundID, :].copy(), 
                         right=factor_df, how='left', on=['MONTH'])
    
    
    # DOCUMENT SAMPLE DETAILS #
    sum_df.loc[fundID, 'numObs'] = data_fund.shape[0]
    sum_df.loc[fundID, 'startMonth'] = data_fund['MONTH'].min()
    sum_df.loc[fundID, 'endMonth'] = data_fund['MONTH'].max()
    
    
    # RUN ADAPTIVE LASSO #
    alpha_fund, coefs_fund, numFeatures_fund = aLASSO(data_fund, XNAMELIST, YNAME)
    
    
    # APPEND COEFFICIENT DATA FRAME LIST #
    coefsDf_list.append(coefs_fund.copy())
    
    
    # DOCUMNET FITTING RESULTS #
    sum_df.loc[fundID, 'Alpha'] = alpha_fund
    sum_df.loc[fundID, 'numFeatures'] = numFeatures_fund
    
    
    # DISPLAY STATUS TO USER
    print('done.')
    
    
    # HOUSE CLEAN #
    del fundID, data_fund, alpha_fund, coefs_fund, numFeatures_fund
    
    
# CONCATENATE COEFFICIENT DATA FRAMES #
coefs_df = pd.concat(coefsDf_list, axis=1)
coefs_df.columns = fundList
coefs_df = coefs_df.T 
del coefsDf_list


# VARIABLE SELECTION ANALYSIS #
coefsSelect_df['Fund'] = (coefs_df.count() / coefs_df.shape[0]).values 
sumSelect_df.loc['Number of Time Series', 'Fund'] = sum_df.shape[0]
sumSelect_df.loc['Average Number of Factors', 'Fund'] = sum_df['numFeatures'].mean()


# HOUSE CLEAN #
del hfExRet_df, fundList, coefs_df, sum_df


    ##################################################
    # EQUAL-WEIGHTED CATEGORY-LEVEL FACTOR SELECTION #
    ##################################################
    

# HEDGE FUND EXCESS RETURNS #
#hfExRet_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfExRet_df = pd.read_pickle('PSEUDODATA1.pkl')
hfExRet_df = hfExRet_df.loc[:,['FUNDID','MONTH','EXRET', 'CAT']].copy()
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 713,737 fund-month observations; 7,314 unique funds


# CONSTRUCT EQUAL-WEIGHTED CATEGORY HEDGE FUND EXCESS RETURNS #
catExRet_df = pd.concat([hfExRet_df.groupby(by=['CAT','MONTH'])['EXRET'].mean(), 
                         hfExRet_df.groupby(by=['CAT','MONTH'])['EXRET'].count()], 
                        axis=1)
catExRet_df.columns = ['EXRET','NUMFUNDS']
catExRet_df = catExRet_df.reset_index()
del hfExRet_df
# 2,697 category-month observations; 10 unique categories
catList = list(catExRet_df['CAT'].unique())


# RECEIVING DATA FRAMES AND LIST FOR DATA FRAMES #
coefsDf_list = list()
sum_df = pd.DataFrame(columns=['Alpha', 'numFeatures', 
                               'numObs', 'startMonth', 'endMonth'], 
                      index=catList)


# LOOP THROUGH HEDGE FUND CATEGORIES #
for cat in catList:
    
    
    # DISPLAY STATUS TO USER
    print(f'Hedge Fund Category: {cat}', end=', ')
    
    
    # GET FITTING DATA #
    data_cat = pd.merge(left=catExRet_df.loc[catExRet_df['CAT']==cat, :].copy(), 
                        right=factor_df, how='left', on=['MONTH'])
    
    
    # DOCUMENT SAMPLE DETAILS #
    sum_df.loc[cat, 'numObs'] = data_cat.shape[0]
    sum_df.loc[cat, 'startMonth'] = data_cat['MONTH'].min()
    sum_df.loc[cat, 'endMonth'] = data_cat['MONTH'].max()
    
    
    # RUN ADAPTIVE LASSO #
    alpha_cat, coefs_cat, numFeatures_cat = aLASSO(data_cat, XNAMELIST, YNAME)
    
    
    # APPEND COEFFICIENT DATA FRAME LIST #
    coefsDf_list.append(coefs_cat.copy())
    
    
    # DOCUMNET FITTING RESULTS #
    sum_df.loc[cat, 'Alpha'] = alpha_cat
    sum_df.loc[cat, 'numFeatures'] = numFeatures_cat 
    
    
    # DISPLAY STATUS TO USER
    print('done.')
    
    
    # HOUSE CLEAN #
    del cat, data_cat, alpha_cat, coefs_cat, numFeatures_cat
    
    
# CONCATENATE COEFFICIENT DATA FRAMES #
coefs_df = pd.concat(coefsDf_list, axis=1)
coefs_df.columns = catList
coefs_df = coefs_df.T 
del coefsDf_list


# VARIABLE SELECTION ANALYSIS #
coefsSelect_df['EW Category'] = (coefs_df.count() / coefs_df.shape[0]).values 
sumSelect_df.loc['Number of Time Series', 'EW Category'] = sum_df.shape[0]
sumSelect_df.loc['Average Number of Factors', 'EW Category'] = sum_df['numFeatures'].mean()


# HOUSE CLEAN #
del catExRet_df, catList, coefs_df, sum_df


    ##################################################
    # VALUE-WEIGHTED CATEGORY-LEVEL FACTOR SELECTION #
    ##################################################
    

# HEDGE FUND EXCESS RETURNS #
#hfExRet_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfExRet_df = pd.read_pickle('PSEUDODATA1.pkl')
hfExRet_df = hfExRet_df.loc[:,['FUNDID','MONTH','EXRET', 'CAT', 'AUM']].copy()
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 713,737 fund-month observations; 7,314 unique funds


# IF MISSING AUM, WEIGHTS SET AS ZERO TO EXCLUDE RETURNS #
hfExRet_df['AUM'].fillna(0, inplace=True)


# CONSTRUCT VALUE-WEIGHTED CATEGORY HEDGE FUND EXCESS RETURNS #
catExRet_df = pd.concat([hfExRet_df.groupby(by=['CAT','MONTH']).apply(lambda x: np.average(x['EXRET'], weights=x['AUM'])), 
                         hfExRet_df.groupby(by=['CAT','MONTH'])['AUM'].count()], 
                        axis=1)
catExRet_df.columns = ['EXRET','NUMFUNDS']
catExRet_df = catExRet_df.reset_index()
del hfExRet_df
# 2,697 category-month observations; 10 unique categories
catList = list(catExRet_df['CAT'].unique())


# RECEIVING DATA FRAMES AND LIST FOR DATA FRAMES #
coefsDf_list = list()
sum_df = pd.DataFrame(columns=['Alpha', 'numFeatures', 
                               'numObs', 'startMonth', 'endMonth'], 
                      index=catList)


# LOOP THROUGH HEDGE FUND CATEGORIES #
for cat in catList:
    
    
    # DISPLAY STATUS TO USER
    print(f'Hedge Fund Category: {cat}', end=', ')
    
    
    # GET FITTING DATA #
    data_cat = pd.merge(left=catExRet_df.loc[catExRet_df['CAT']==cat, :].copy(), 
                        right=factor_df, how='left', on=['MONTH'])
    
    
    # DOCUMENT SAMPLE DETAILS #
    sum_df.loc[cat, 'numObs'] = data_cat.shape[0]
    sum_df.loc[cat, 'startMonth'] = data_cat['MONTH'].min()
    sum_df.loc[cat, 'endMonth'] = data_cat['MONTH'].max()
    
    
    # RUN ADAPTIVE LASSO #
    alpha_cat, coefs_cat, numFeatures_cat = aLASSO(data_cat, XNAMELIST, YNAME)
    
    
    # APPEND COEFFICIENT DATA FRAME LIST #
    coefsDf_list.append(coefs_cat.copy())
    
    
    # DOCUMNET FITTING RESULTS #
    sum_df.loc[cat, 'Alpha'] = alpha_cat
    sum_df.loc[cat, 'numFeatures'] = numFeatures_cat 
    
    
    # DISPLAY STATUS TO USER
    print('done.')
    
    
    # HOUSE CLEAN #
    del cat, data_cat, alpha_cat, coefs_cat, numFeatures_cat
    
    
# CONCATENATE COEFFICIENT DATA FRAMES #
coefs_df = pd.concat(coefsDf_list, axis=1)
coefs_df.columns = catList
coefs_df = coefs_df.T 
del coefsDf_list


# VARIABLE SELECTION ANALYSIS #
coefsSelect_df['VW Category'] = (coefs_df.count() / coefs_df.shape[0]).values 
sumSelect_df.loc['Number of Time Series', 'VW Category'] = sum_df.shape[0]
sumSelect_df.loc['Average Number of Factors', 'VW Category'] = sum_df['numFeatures'].mean()


# HOUSE CLEAN #
del catExRet_df, catList, coefs_df, sum_df


    ###################################
    # INDUSTRY-LEVEL FACTOR SELECTION #
    ###################################
    

# HEDGE FUND EXCESS RETURNS #
#hfExRet_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfExRet_df = pd.read_pickle('PSEUDODATA1.pkl')
hfExRet_df = hfExRet_df.loc[:,['FUNDID','MONTH','EXRET', 'AUM']].copy()
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 713,737 fund-month observations; 7,314 unique funds


# IF MISSING AUM, WEIGHTS SET AS ZERO TO EXCLUDE RETURNS #
hfExRet_df['AUM'].fillna(0, inplace=True)


# CONSTRUCT EW AND VW HEDGE FUND INDUSTRY RETURNS #
indExRet = pd.concat([hfExRet_df.groupby(by=['MONTH'])['EXRET'].mean(), 
                      hfExRet_df.groupby(by=['MONTH']).apply(lambda x: np.average(x['EXRET'], weights=x['AUM']))], axis=1)
indExRet.columns = ['EXRET_EW','EXRET_VW']
indExRet = indExRet.reset_index()
del hfExRet_df


        ###############
        # EW INDUSTRY #
        ###############


# DISPLAY STATUS TO USER
YNAME = 'EXRET_EW'
print(f'Hedge Fund Industry: {YNAME}')
    
    
# GET FITTING DATA #
data_ind = pd.merge(left=indExRet.loc[:, ['MONTH', YNAME]].copy(), 
                    right=factor_df, how='left', on=['MONTH'])


# RUN ADAPTIVE LASSO #
alpha_ind, coefs_ind, numFeatures_ind = aLASSO(data_ind, XNAMELIST, YNAME)


# VARIABLE SELECTION ANALYSIS #
coefsSelect_df['EW Industry'] = coefs_ind.values 
sumSelect_df.loc['Number of Time Series', 'EW Industry'] = 1
sumSelect_df.loc['Average Number of Factors', 'EW Industry'] = numFeatures_ind


# HOUSE CLEAN #
del YNAME, data_ind, alpha_ind, coefs_ind, numFeatures_ind


        ###############
        # VW INDUSTRY #
        ###############


# DISPLAY STATUS TO USER
YNAME = 'EXRET_VW'
print(f'Hedge Fund Industry: {YNAME}')
    
    
# GET FITTING DATA #
data_ind = pd.merge(left=indExRet.loc[:, ['MONTH', YNAME]].copy(), 
                    right=factor_df, how='left', on=['MONTH'])


# RUN ADAPTIVE LASSO #
alpha_ind, coefs_ind, numFeatures_ind = aLASSO(data_ind, XNAMELIST, YNAME)


# VARIABLE SELECTION ANALYSIS #
coefsSelect_df['VW Industry'] = coefs_ind.values 
sumSelect_df.loc['Number of Time Series', 'VW Industry'] = 1
sumSelect_df.loc['Average Number of Factors', 'VW Industry'] = numFeatures_ind


# HOUSE CLEAN #
del YNAME, data_ind, alpha_ind, coefs_ind, numFeatures_ind
del XNAMELIST, indExRet


        ##################
        # OUTPUT RESULTS #
        ##################


# RANKS OF SELECTION RATE #
coefsSelect_df = coefsSelect_df.iloc[1:, :].copy()
for col in ['Fund', 'EW Category', 'VW Category']: 
    
    
    # RANKS OF VARIABLES BASED ON SELECTION RATES #
    colRank = coefsSelect_df[col].rank(method='min', ascending=False)
    
    
    # ONLY KEEP TOP 10 RANKS #
    colRank.loc[colRank>10] = np.NaN
    colRank.loc[colRank<=10] = 1 
    coefsSelect_df[col] = colRank.copy()
    
    
    # HOUSE CLEAN #
    del col, colRank
coefsSelect_df.loc[coefsSelect_df['EW Industry'].notna(), 'EW Industry'] = 1
coefsSelect_df.loc[coefsSelect_df['VW Industry'].notna(), 'VW Industry'] = 1 
coefsSelect_df = coefsSelect_df.dropna(how='all')


# REORDER INDEX #
# =============================================================================
# newIdx = [# SELECTED BY 5 LEVELS #
#           'MKT', 'BAB', 'LRSK', 'CRDT',
#           # SELECTED BY 4 LEVELS #
#           'TSMOM', 'AGR',
#           # SELECTED BY 3 LEVELS #
#           'ROA', '10Y', 'TRM',
#           # SELECTED BY 2 LEVELS #
#           'MOM', 'SVAR', 'UNC', 'IRTF', 'GPF', 'INVEST', 
#           # SELECTED BY 1 LEVELS #
#           'PFT', 'SSN', 'VALUE', 'DPR']
# coefsSelect_df = coefsSelect_df.loc[newIdx, :].copy()
# del newIdx
# =============================================================================


# OUTPUT TABLE #
out_df = pd.concat([sumSelect_df, coefsSelect_df])
del sumSelect_df, coefsSelect_df
out_df.to_csv(tabDir + '\\Table3.csv')
del out_df






############################
#         TABLE 4          # 
############################


# HEDGE FUND EXCESS RETURNS #
#hfExRet_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfExRet_df = pd.read_pickle('PSEUDODATA1.pkl')
hfExRet_df = hfExRet_df.loc[:,['FUNDID','MONTH','EXRET', 'AUM']].copy()
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 713,737 fund-month observations; 7,314 unique funds


# IF MISSING AUM, WEIGHTS SET AS ZERO TO EXCLUDE RETURNS #
hfExRet_df['AUM'].fillna(0, inplace=True)


# CONSTRUCT EW AND VW HEDGE FUND INDUSTRY RETURNS #
indExRet = pd.concat([hfExRet_df.groupby(by=['MONTH'])['EXRET'].mean(), 
                      hfExRet_df.groupby(by=['MONTH']).apply(lambda x: np.average(x['EXRET'], weights=x['AUM']))], axis=1)
indExRet.columns = ['EXRET_EW','EXRET_VW']
indExRet = indExRet.reset_index()
del hfExRet_df


# MERGE WITH FACTOR TABLE #
indExRet = pd.merge(left=indExRet, right=factor_df, how='left', on=['MONTH'])


    #####################################
    # FUNCTION FOR DIEBOLD-MARIANO TEST #
    #####################################


# DM TEST FUNCTION WITH NEWEY-WEST ADJUSTMENT OPTION # 
def DMTest(y_true, y_pred1, y_pred2, maxlags):
    """
    Diebold-Mariano Test with Newey-West Adjustment Options
  
    Parameters: 
    y_true (pd.Series): True label series
    y_pred1 (pd.Series): Predicted label series from learner 1
    y_pred2 (pd.Series): Predicted label series from learner 1
    maxlags (int): number of maximum lag terms
  
    Returns: 
    t-stat, p-value
    """
    
    
    # CHECK INPUTS #
    assert isinstance(y_true, pd.Series), "Only take Pandas Series as true label input."
    assert isinstance(y_pred1, pd.Series), "Only take Pandas Series as pred1 label input."
    assert isinstance(y_pred2, pd.Series), "Only take Pandas Series as pred2 label input."
    assert (len(y_true)==len(y_pred1))&(len(y_true)==len(y_pred2))&(len(y_pred1)==len(y_pred2)), \
        "Input should have same number of observations."
    
    
    # CALCULATE SQUARED ERRORS #
    sqerr_1 = (y_pred1-y_true)**2
    sqerr_2 = (y_pred2-y_true)**2
    d = sqerr_2 - sqerr_1
    
    
    # OLS-BASED DM TEST #
    if maxlags > 0: 
        dmtest = sm.OLS(d,np.ones(len(d))).fit(cov_type='HAC',cov_kwds={'maxlags':maxlags})
    else:
        dmtest = sm.OLS(d,np.ones(len(d))).fit()

    return dmtest.tvalues[0], dmtest.pvalues[0]


# RECEIVING DATA FRAMES #
out_df = pd.DataFrame(columns=['EW 4-Factor', 'EW 6-Factor', 'EW 9-Factor',
                               'VW 4-Factor', 'VW 6-Factor', 'VW 9-Factor'], 
                      index=['const', 'const_t', 
                             'MKT', 'MKT_t', 'BAB', 'BAB_t', 'LRSK', 'LRSK_t', 
                             'TSMOM', 'TSMOM_t', 'AGR', 'AGR_t', 'ROA', 'ROA_t', 
                             'CRDT', 'CRDT_t', '10Y', '10Y_t', 'TRM', 'TRM_t',  
                             'NumObs', 'AdjR2', 'AIC', 'BIC'])
dm_df = pd.DataFrame(index=['EW 4-Factor', 'EW 6-Factor', 'EW 9-Factor', 
                            'VW 4-Factor', 'VW 6-Factor', 'VW 9-Factor'], 
                     columns=['EW 4-Factor', 'EW 6-Factor', 'EW 9-Factor',
                              'VW 4-Factor', 'VW 6-Factor', 'VW 9-Factor'])


    #################################################
    # EW INDUSTRY PORTFOLIO TIME-SERIES REGRESSIONS #
    #################################################


# MODEL RECEIVING DICTIONARY #
mod_dict = dict()


        ##################
        # 4 FACTOR MODEL #
        ##################


# DEFINE FACTOR LIST AND LABEL #
XNAMELIST = ['BAB', 'LRSK', 'MKT', 'CRDT']
YNAME = 'EXRET_EW'


# OLS FIT #
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
mod_dict['EW 4-Factor'] = factorModel


# OLS RESULTS #
coefIndex_list = ['const', 'const_t']
for i in range(len(XNAMELIST)):
    coefIndex_list.append(XNAMELIST[i])
    coefIndex_list.append(XNAMELIST[i]+'_t')
    del i
coef = pd.DataFrame(columns=[0], index=coefIndex_list)
coef = coef.reset_index()
coef.columns = ['Var', 'Value']
for var in ['const'] + XNAMELIST:
    coef.loc[coef['Var']==var, 'Value'] = f'{factorModel.params[var]:.2f}'
    coef.loc[coef['Var']==var+'_t', 'Value'] = f'({factorModel.tvalues[var]:.2f})'
    del var
coef.set_index(keys=['Var'], inplace=True)
out_df['EW 4-Factor'] = coef['Value'].copy()

out_df.loc['NumObs', 'EW 4-Factor'] = int(factorModel.nobs)
out_df.loc['AdjR2', 'EW 4-Factor'] = factorModel.rsquared_adj
out_df.loc['AIC', 'EW 4-Factor'] = factorModel.aic
out_df.loc['BIC', 'EW 4-Factor'] = factorModel.bic


# HOUSE CLEAN #
del XNAMELIST, YNAME, factorModel, coefIndex_list, coef


        ##################
        # 6 FACTOR MODEL #
        ##################


# DEFINE FACTOR LIST AND LABEL #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'TSMOM', 'CRDT']
YNAME = 'EXRET_EW'


# OLS FIT #
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
mod_dict['EW 6-Factor'] = factorModel


# OLS RESULTS #
coefIndex_list = ['const', 'const_t']
for i in range(len(XNAMELIST)):
    coefIndex_list.append(XNAMELIST[i])
    coefIndex_list.append(XNAMELIST[i]+'_t')
    del i
coef = pd.DataFrame(columns=[0], index=coefIndex_list)
coef = coef.reset_index()
coef.columns = ['Var', 'Value']
for var in ['const'] + XNAMELIST:
    coef.loc[coef['Var']==var, 'Value'] = f'{factorModel.params[var]:.2f}'
    coef.loc[coef['Var']==var+'_t', 'Value'] = f'({factorModel.tvalues[var]:.2f})'
    del var
coef.set_index(keys=['Var'], inplace=True)
out_df['EW 6-Factor'] = coef['Value'].copy()

out_df.loc['NumObs', 'EW 6-Factor'] = int(factorModel.nobs)
out_df.loc['AdjR2', 'EW 6-Factor'] = factorModel.rsquared_adj
out_df.loc['AIC', 'EW 6-Factor'] = factorModel.aic
out_df.loc['BIC', 'EW 6-Factor'] = factorModel.bic


# HOUSE CLEAN #
del XNAMELIST, YNAME, factorModel, coefIndex_list, coef


        ##################
        # 9 FACTOR MODEL #
        ##################


# DEFINE FACTOR LIST AND LABEL #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']
YNAME = 'EXRET_EW'


# OLS FIT #
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
mod_dict['EW 9-Factor'] = factorModel


# OLS RESULTS #
coefIndex_list = ['const', 'const_t']
for i in range(len(XNAMELIST)):
    coefIndex_list.append(XNAMELIST[i])
    coefIndex_list.append(XNAMELIST[i]+'_t')
    del i
coef = pd.DataFrame(columns=[0], index=coefIndex_list)
coef = coef.reset_index()
coef.columns = ['Var', 'Value']
for var in ['const'] + XNAMELIST:
    coef.loc[coef['Var']==var, 'Value'] = f'{factorModel.params[var]:.2f}'
    coef.loc[coef['Var']==var+'_t', 'Value'] = f'({factorModel.tvalues[var]:.2f})'
    del var
coef.set_index(keys=['Var'], inplace=True)
out_df['EW 9-Factor'] = coef['Value'].copy()

out_df.loc['NumObs', 'EW 9-Factor'] = int(factorModel.nobs)
out_df.loc['AdjR2', 'EW 9-Factor'] = factorModel.rsquared_adj
out_df.loc['AIC', 'EW 9-Factor'] = factorModel.aic
out_df.loc['BIC', 'EW 9-Factor'] = factorModel.bic


# HOUSE CLEAN #
del XNAMELIST, YNAME, factorModel, coefIndex_list, coef


        ####################################################
        # DIEBOLD-MARIANO TEST FOR NESTED MODEL COMPARISON #
        ####################################################


# LOOP THROUGH MODELS
for mod1 in ['EW 4-Factor', 'EW 6-Factor', 'EW 9-Factor']:
    
    for mod2 in ['EW 4-Factor', 'EW 6-Factor', 'EW 9-Factor']:
        
        tStat, pValue = DMTest(indExRet['EXRET_EW'], 
                               mod_dict[mod1].fittedvalues, 
                               mod_dict[mod2].fittedvalues, 3)
    
        tStat = f"{tStat: .2f}"
        dm_df.loc[mod2, mod1] = tStat
        
        del mod2, tStat, pValue
    
    del mod1


# HOUSE CLEAN #
del mod_dict


    #################################################
    # VW INDUSTRY PORTFOLIO TIME-SERIES REGRESSIONS #
    #################################################


# MODEL RECEIVING DICTIONARY #
mod_dict = dict()


        ##################
        # 4 FACTOR MODEL #
        ##################


# DEFINE FACTOR LIST AND LABEL #
XNAMELIST = ['BAB', 'LRSK', 'MKT', 'CRDT']
YNAME = 'EXRET_VW'


# OLS FIT #
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
mod_dict['VW 4-Factor'] = factorModel


# OLS RESULTS #
coefIndex_list = ['const', 'const_t']
for i in range(len(XNAMELIST)):
    coefIndex_list.append(XNAMELIST[i])
    coefIndex_list.append(XNAMELIST[i]+'_t')
    del i
coef = pd.DataFrame(columns=[0], index=coefIndex_list)
coef = coef.reset_index()
coef.columns = ['Var', 'Value']
for var in ['const'] + XNAMELIST:
    coef.loc[coef['Var']==var, 'Value'] = f'{factorModel.params[var]:.2f}'
    coef.loc[coef['Var']==var+'_t', 'Value'] = f'({factorModel.tvalues[var]:.2f})'
    del var
coef.set_index(keys=['Var'], inplace=True)
out_df['VW 4-Factor'] = coef['Value'].copy()

out_df.loc['NumObs', 'VW 4-Factor'] = int(factorModel.nobs)
out_df.loc['AdjR2', 'VW 4-Factor'] = factorModel.rsquared_adj
out_df.loc['AIC', 'VW 4-Factor'] = factorModel.aic
out_df.loc['BIC', 'VW 4-Factor'] = factorModel.bic


# HOUSE CLEAN #
del XNAMELIST, YNAME, factorModel, coefIndex_list, coef


        ##################
        # 6 FACTOR MODEL #
        ##################


# DEFINE FACTOR LIST AND LABEL #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'TSMOM', 'CRDT']
YNAME = 'EXRET_VW'


# OLS FIT #
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
mod_dict['VW 6-Factor'] = factorModel


# OLS RESULTS #
coefIndex_list = ['const', 'const_t']
for i in range(len(XNAMELIST)):
    coefIndex_list.append(XNAMELIST[i])
    coefIndex_list.append(XNAMELIST[i]+'_t')
    del i
coef = pd.DataFrame(columns=[0], index=coefIndex_list)
coef = coef.reset_index()
coef.columns = ['Var', 'Value']
for var in ['const'] + XNAMELIST:
    coef.loc[coef['Var']==var, 'Value'] = f'{factorModel.params[var]:.2f}'
    coef.loc[coef['Var']==var+'_t', 'Value'] = f'({factorModel.tvalues[var]:.2f})'
    del var
coef.set_index(keys=['Var'], inplace=True)
out_df['VW 6-Factor'] = coef['Value'].copy()

out_df.loc['NumObs', 'VW 6-Factor'] = int(factorModel.nobs)
out_df.loc['AdjR2', 'VW 6-Factor'] = factorModel.rsquared_adj
out_df.loc['AIC', 'VW 6-Factor'] = factorModel.aic
out_df.loc['BIC', 'VW 6-Factor'] = factorModel.bic


# HOUSE CLEAN #
del XNAMELIST, YNAME, factorModel, coefIndex_list, coef


        ##################
        # 9 FACTOR MODEL #
        ##################


# DEFINE FACTOR LIST AND LABEL #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']
YNAME = 'EXRET_VW'


# OLS FIT #
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
mod_dict['VW 9-Factor'] = factorModel


# OLS RESULTS #
coefIndex_list = ['const', 'const_t']
for i in range(len(XNAMELIST)):
    coefIndex_list.append(XNAMELIST[i])
    coefIndex_list.append(XNAMELIST[i]+'_t')
    del i
coef = pd.DataFrame(columns=[0], index=coefIndex_list)
coef = coef.reset_index()
coef.columns = ['Var', 'Value']
for var in ['const'] + XNAMELIST:
    coef.loc[coef['Var']==var, 'Value'] = f'{factorModel.params[var]:.2f}'
    coef.loc[coef['Var']==var+'_t', 'Value'] = f'({factorModel.tvalues[var]:.2f})'
    del var
coef.set_index(keys=['Var'], inplace=True)
out_df['VW 9-Factor'] = coef['Value'].copy()

out_df.loc['NumObs', 'VW 9-Factor'] = int(factorModel.nobs)
out_df.loc['AdjR2', 'VW 9-Factor'] = factorModel.rsquared_adj
out_df.loc['AIC', 'VW 9-Factor'] = factorModel.aic
out_df.loc['BIC', 'VW 9-Factor'] = factorModel.bic


# HOUSE CLEAN #
del XNAMELIST, YNAME, factorModel, coefIndex_list, coef


        ####################################################
        # DIEBOLD-MARIANO TEST FOR NESTED MODEL COMPARISON #
        ####################################################


# LOOP THROUGH MODELS
for mod1 in ['VW 4-Factor', 'VW 6-Factor', 'VW 9-Factor']:
    
    for mod2 in ['VW 4-Factor', 'VW 6-Factor', 'VW 9-Factor']:
        
        tStat, pValue = DMTest(indExRet['EXRET_VW'], 
                               mod_dict[mod1].fittedvalues, 
                               mod_dict[mod2].fittedvalues, 3)
    
        tStat = f"{tStat: .2f}"
        dm_df.loc[mod2, mod1] = tStat
        
        del mod2, tStat, pValue
    
    del mod1


# HOUSE CLEAN #
del mod_dict


    ##################
    # OUTPUT RESULTS #
    ##################


# OUTPUT RESULT TABLE #
dm_df1 = dm_df.loc[['EW 6-Factor', 'EW 9-Factor'], 
                   ['EW 4-Factor', 'EW 6-Factor']].copy()
dm_df1.index = ['6-Factor DMT', '9-Factor DMT']
dm_df1.columns = ['EW 4-Factor', 'EW 6-Factor']

dm_df2 = dm_df.loc[['VW 6-Factor', 'VW 9-Factor'], 
                   ['VW 4-Factor', 'VW 6-Factor']].copy()
dm_df2.index = ['6-Factor DMT', '9-Factor DMT']
dm_df2.columns = ['VW 4-Factor', 'VW 6-Factor']

dm_df = pd.concat([dm_df1, dm_df2], axis=1)
del dm_df1, dm_df2

out_df = pd.concat([out_df, dm_df])
out_df.to_excel(tabDir + '\\Table4.xlsx')
del dm_df, out_df, indExRet






############################
#         FIGURE 1         # 
############################


# IMPORT PCA FUNCTION #
from sklearn.decomposition import PCA


# HEDGE FUND EXCESS RETURNS #
#hfExRet_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfExRet_df = pd.read_pickle('PSEUDODATA1.pkl')
hfExRet_df = hfExRet_df.loc[:,['FUNDID','MONTH','EXRET']].copy()
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 713,737 fund-month observations; 7,314 unique funds


# RECEIVING DATA FRAME #
out_df = pd.DataFrame(index=range(25), columns=np.arange(1997, 2018, 2))


# LOOP THROUGH YEARS #
for year in np.arange(1997, 2018, 2):
    
    
    # DISPLAY STATUS TO USER #
    print(f"{year} - {year+1}", end=", ")
    
    
    # GET RETURNS WITHIN WINDOW #
    hfExRet_window = hfExRet_df.loc[(hfExRet_df['MONTH']>=year*100+1) & (hfExRet_df['MONTH']<=(year+1)*100+12), :].copy()
    
    
    # PIVOT RETURN TABLE #
    hfExRet_window = pd.pivot_table(hfExRet_window, values='EXRET', 
                                    index='MONTH', columns="FUNDID")
    
    
    # DROP FUND WITHOUT FULL HISTORY #
    hfExRet_window.dropna(axis=1, inplace=True)
    
    
    # CHECK NUMBER OF FUNDS IN WINDOW #
    numFunds = hfExRet_window.shape[1]
    out_df.loc[0, year] = numFunds
    print(f"{numFunds} funds in window", end=", ")
    
    
    # SKIP WINDOW WITH LESS THAN 60 FUNDS #
    if numFunds < 60:
        print("Skip.")
        continue
    
    
    # PCA ON FUND RETURN MATRIX #
    pca = PCA(n_components=24)
    pca.fit(hfExRet_window)
    
    
    # CUMULTATIVE VARIANCE EXPLAINED RATIOS #
    out_df.loc[1:24, year] = pca.explained_variance_ratio_
    print("PCA done.")
    
    
    # HOUSE CLEAN #
    del year, hfExRet_window, numFunds, pca 
    
    
# PLOT DATA PREPARATION #
plot_df = pd.DataFrame(out_df.iloc[1:,].mean(axis=1) * 100, 
                       index=np.arange(1, 25), 
                       columns=['Variance Explained Ratio'])


# PLOT FOR VARIANCE EXPLAINED #
import matplotlib.pyplot as plt

font = {'size'   : 16}

matplotlib.rc('font', **font)

fig, ax = plt.subplots(figsize=(12, 8))
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

fig = plt.bar(range(1,len(plot_df)+1), plot_df['Variance Explained Ratio'], 
              alpha=0.5, align='center')

plt.xticks(np.arange(5, 26, 5))
plt.yticks([0,10,20,30,40,50])
plt.xlim(left=0.5, right=24)
plt.ylim(bottom=0, top=50)
plt.ylabel('Variance Explained Ratio (in %)', fontsize=20)
    
plt.axvline(x=5, color='grey', linewidth=0.1)
plt.axvline(x=10, color='grey', linewidth=0.1)
plt.axvline(x=15, color='grey', linewidth=0.1)
plt.axvline(x=20, color='grey', linewidth=0.1)
    
plt.axhline(y=10, color='grey', linewidth=0.1)
plt.axhline(y=20, color='grey', linewidth=0.1)
plt.axhline(y=30, color='grey', linewidth=0.1)
plt.axhline(y=40, color='grey', linewidth=0.1)

plt.savefig(figDir+'\\Figure1.png')
plt.close()


# HOUSE CLEAN #
del ax, fig, font, hfExRet_df, out_df, plot_df






############################
#         TABLE 5          # 
############################


# IMPORT FAMA-FRENCH FIVE FACTORS #
ff5factor_df = pd.read_excel('MONTH_FF5F.xlsx')
factor_df = pd.merge(left=factor_df, right=ff5factor_df, how='left', on=['MONTH'])
del ff5factor_df


# HEDGE FUND EXCESS RETURNS #
#hfExRet_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfExRet_df = pd.read_pickle('PSEUDODATA1.pkl')
hfExRet_df = hfExRet_df.loc[:,['FUNDID','MONTH','EXRET', 'CAT', 'AUM']].copy()
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 713,737 fund-month observations; 7,314 unique funds


# REPLACE ZERO AUM WITH MISSING #
hfExRet_df['AUM'].replace(0, np.NaN, inplace=True)


    #########################
    # FUNCTION FOR GRS TEST #
    #########################


# DEFINE FUNCTION #
def grsTestPvalue(alphas, factors, epsilons):
    '''
    

    Parameters
    ----------
    alphas : pd.Series
        list of alpha estimations for N testing assets.
    factors : pd.DataFrame
        Data Frame of factors over months.
    epsilons : pd.DataFrame
        Data Frame of error estimations over months across testing assets.

    Returns
    -------
    fValue : float
        F-statistics
    pValue : float
        P-value of GRS F-test.

    '''
    
    
    # GET CONSTANT #
    T = epsilons.shape[0]
    N = len(alphas)
    K = factors.shape[1]
    
    
    # LEFT-PART a #
    a = (T - N - K) / N
    
    
    # MIDDLE-PART b #
    E_f = factors.mean()
    omega_hat = factors.cov()
    
    b = 1 / ( 1 + E_f.T @ np.linalg.inv(omega_hat) @ E_f ) 
    
    
    # RIGHT-PART c #
    sigma_hat = epsilons.cov()
    
    c = alphas.T @ np.linalg.inv(sigma_hat) @ alphas
    
    
    # MULTIPLY THREE PARTS AND CALCULATE P-VALUE #
    fValue = a * b * c
    
    from scipy.stats import f
    pValue = 1- f.cdf (fValue, N, T-N-K)
    
    return fValue, pValue


    ##################################
    # HEDGE FUND INDUSTRY PORTFOLIOS #
    ##################################


# RECEIVING DATA FRAMES #
indOut_df = pd.DataFrame(columns=['HF9 Alpha', 'HF9 Adj-R2',
                                  'CAPM Alpha', 'CAPM Adj-R2',
                                  'FF5 Alpha', 'FF5 Adj-R2',
                                  'FH7 Alpha', 'FH7 Adj-R2'], 
                         index= ['EW Est', 'EW T-stat', 'VW Est', 'VW T-stat'])


# IF MISSING AUM, WEIGHTS SET AS ZERO TO EXCLUDE RETURNS #
hfExRet_df['AUM'].fillna(0, inplace=True)


# CONSTRUCT EW AND VW HEDGE FUND INDUSTRY RETURNS #
indExRet = pd.concat([hfExRet_df.groupby(by=['MONTH'])['EXRET'].mean(), 
                      hfExRet_df.groupby(by=['MONTH']).apply(lambda x: np.average(x['EXRET'], weights=x['AUM']))], axis=1)
indExRet.columns = ['EXRET_EW','EXRET_VW']
indExRet = indExRet.reset_index()


# RESTORE AUM MISSING #
hfExRet_df['AUM'].replace(0, np.NaN, inplace=True)


# MERGE WITH FACTOR TABLE #
indExRet = pd.merge(left=indExRet, right=factor_df, how='left', on=['MONTH'])


        ################################################
        # EW INDUSTRY PORTFOLIO TIME-SERIES REGRESSION #
        ################################################
    
    
# DEFINE DEPENDENT VARIABLE #
YNAME = 'EXRET_EW'


# 9-FACTOR MODEL #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['EW Est', ['HF9 Alpha', 'HF9 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['EW T-stat', 'HF9 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# CAPM MODEL #
XNAMELIST = ['MKT']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['EW Est', ['CAPM Alpha', 'CAPM Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['EW T-stat', 'CAPM Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# FANA FRENCH 5-FACTOR MODEL #
XNAMELIST = ['FF5F_MKT', 'FF5F_SMB', 'FF5F_HML', 'FF5F_RMW', 'FF5F_CMA']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['EW Est', ['FF5 Alpha', 'FF5 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['EW T-stat', 'FF5 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# FUNG-HSIEH 7-FACTOR MODEL #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['EW Est', ['FH7 Alpha', 'FH7 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['EW T-stat', 'FH7 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# HOUSE CLEAN #
del YNAME


        ################################################
        # VW INDUSTRY PORTFOLIO TIME-SERIES REGRESSION #
        ################################################
    
    
# DEFINE DEPENDENT VARIABLE #
YNAME = 'EXRET_VW'


# 9-FACTOR MODEL #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['VW Est', ['HF9 Alpha', 'HF9 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['VW T-stat', 'HF9 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# CAPM MODEL #
XNAMELIST = ['MKT']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['VW Est', ['CAPM Alpha', 'CAPM Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['VW T-stat', 'CAPM Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# FANA FRENCH 5-FACTOR MODEL #
XNAMELIST = ['FF5F_MKT', 'FF5F_SMB', 'FF5F_HML', 'FF5F_RMW', 'FF5F_CMA']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['VW Est', ['FF5 Alpha', 'FF5 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['VW T-stat', 'FF5 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# FUNG-HSIEH 7-FACTOR MODEL #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['VW Est', ['FH7 Alpha', 'FH7 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['VW T-stat', 'FH7 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# HOUSE CLEAN #
del YNAME


        ##################################
        # HOUSE CLEAN FOR INDUSTRY LEVEL #
        ##################################
    

# HOUSE CLEAN #
del indExRet


    #####################################################
    # EQUAL-WEIGHTED STRATEGY-CATEGORY-BASED PORTFOLIOS #
    #####################################################


# RECEIVING DATA FRAMES #
ewCatOut_df = pd.DataFrame(columns=['HF9 Alpha', 'HF9 Adj-R2',
                                    'CAPM Alpha', 'CAPM Adj-R2',
                                    'FF5 Alpha', 'FF5 Adj-R2',
                                    'FH7 Alpha', 'FH7 Adj-R2'], 
                         index= ['EW CA Est', 'EW CA T-stat', 
                                 'EW DSB Est', 'EW DSB T-stat',
                                 'EW ED Est', 'EW ED T-stat', 
                                 'EW EM Est', 'EW EM T-stat',
                                 'EW EMN Est', 'EW EMN T-stat', 
                                 'EW FIA Est', 'EW FIA T-stat',
                                 'EW FOF Est', 'EW FOF T-stat', 
                                 'EW GM Est', 'EW GM T-stat',
                                 'EW LSEH Est', 'EW LSEH T-stat', 
                                 'EW MS Est', 'EW MS T-stat',
                                 'Average |Alpha|', 'Average Adj-R2', 'F-GRS'])


# CONSTRUCT CATEGORY HEDGE FUND EXCESS RETURNS #
catExRet_df = pd.concat([hfExRet_df.groupby(by=['CAT','MONTH'])['EXRET'].mean(), 
                         hfExRet_df.groupby(by=['CAT','MONTH'])['EXRET'].count()], 
                        axis=1)
catExRet_df.columns = ['EXRET','NUMFUNDS']
catExRet_df = catExRet_df.reset_index()
# 2,697 category-month observations; 10 unique categories
catList = list(catExRet_df['CAT'].unique())


# PIVOT TABLE #
catExRet_df = pd.pivot_table(catExRet_df, values='EXRET', 
                             index='MONTH', columns='CAT')
catExRet_df = catExRet_df.reset_index()

# MERGE WITH FACTOR TABLE #
catExRet_df = pd.merge(left=catExRet_df, right=factor_df, 
                       how='left', on=['MONTH'])


        ##################
        # 9-FACTOR MODEL #
        ##################


# DEFINE X #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    ewCatOut_df.loc[f'EW {cat} Est', ['HF9 Alpha', 'HF9 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    ewCatOut_df.loc[f'EW {cat} T-stat', 'HF9 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
ewCatOut_df.loc['Average |Alpha|', 'HF9 Alpha'] = np.abs(alpha_df['Alpha']).mean()
ewCatOut_df.loc['Average Adj-R2', 'HF9 Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
ewCatOut_df.loc['F-GRS', 'HF9 Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        ##############
        # CAPM MODEL #
        ##############


# DEFINE X #
XNAMELIST = ['MKT']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    ewCatOut_df.loc[f'EW {cat} Est', ['CAPM Alpha', 'CAPM Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    ewCatOut_df.loc[f'EW {cat} T-stat', 'CAPM Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
ewCatOut_df.loc['Average |Alpha|', 'CAPM Alpha'] = np.abs(alpha_df['Alpha']).mean()
ewCatOut_df.loc['Average Adj-R2', 'CAPM Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
ewCatOut_df.loc['F-GRS', 'CAPM Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        ##############################
        # FANA FRENCH 5-FACTOR MODEL #
        ##############################


# DEFINE X #
XNAMELIST = ['FF5F_MKT', 'FF5F_SMB', 'FF5F_HML', 'FF5F_RMW', 'FF5F_CMA']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    ewCatOut_df.loc[f'EW {cat} Est', ['FF5 Alpha', 'FF5 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    ewCatOut_df.loc[f'EW {cat} T-stat', 'FF5 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
ewCatOut_df.loc['Average |Alpha|', 'FF5 Alpha'] = np.abs(alpha_df['Alpha']).mean()
ewCatOut_df.loc['Average Adj-R2', 'FF5 Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
ewCatOut_df.loc['F-GRS', 'FF5 Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        #############################
        # FUNG-HSIEH 7-FACTOR MODEL #
        #############################


# DEFINE X #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    ewCatOut_df.loc[f'EW {cat} Est', ['FH7 Alpha', 'FH7 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    ewCatOut_df.loc[f'EW {cat} T-stat', 'FH7 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
ewCatOut_df.loc['Average |Alpha|', 'FH7 Alpha'] = np.abs(alpha_df['Alpha']).mean()
ewCatOut_df.loc['Average Adj-R2', 'FH7 Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
ewCatOut_df.loc['F-GRS', 'FH7 Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        ###############################
        # HOUSE CLEAN FOR EW CATEGORY #
        ###############################
    

# HOUSE CLEAN #
del catExRet_df, catList


    #####################################################
    # VALUE-WEIGHTED STRATEGY-CATEGORY-BASED PORTFOLIOS #
    #####################################################


# RECEIVING DATA FRAMES #
vwCatOut_df = pd.DataFrame(columns=['HF9 Alpha', 'HF9 Adj-R2',
                                    'CAPM Alpha', 'CAPM Adj-R2',
                                    'FF5 Alpha', 'FF5 Adj-R2',
                                    'FH7 Alpha', 'FH7 Adj-R2'], 
                         index= ['VW CA Est', 'VW CA T-stat', 
                                 'VW DSB Est', 'VW DSB T-stat',
                                 'VW ED Est', 'VW ED T-stat', 
                                 'VW EM Est', 'VW EM T-stat',
                                 'VW EMN Est', 'VW EMN T-stat', 
                                 'VW FIA Est', 'VW FIA T-stat',
                                 'VW FOF Est', 'VW FOF T-stat', 
                                 'VW GM Est', 'VW GM T-stat',
                                 'VW LSEH Est', 'VW LSEH T-stat', 
                                 'VW MS Est', 'VW MS T-stat',
                                 'Average |Alpha|', 'Average Adj-R2', 'F-GRS'])


# IF MISSING AUM, WEIGHTS SET AS ZERO TO EXCLUDE RETURNS #
hfExRet_df['AUM'].fillna(0, inplace=True)


# CONSTRUCT CATEGORY HEDGE FUND EXCESS RETURNS #
catExRet_df = pd.concat([hfExRet_df.groupby(by=['CAT','MONTH']).apply(lambda x: np.average(x['EXRET'], weights=x['AUM'])), 
                         hfExRet_df.groupby(by=['CAT','MONTH'])['AUM'].count()], 
                        axis=1)
catExRet_df.columns = ['EXRET','NUMFUNDS']
catExRet_df = catExRet_df.reset_index()
# 2,697 category-month observations; 10 unique categories
catList = list(catExRet_df['CAT'].unique())


# RESTORE AUM MISSING #
hfExRet_df['AUM'].replace(0, np.NaN, inplace=True)


# PIVOT TABLE #
catExRet_df = pd.pivot_table(catExRet_df, values='EXRET', 
                             index='MONTH', columns='CAT')
catExRet_df = catExRet_df.reset_index()

# MERGE WITH FACTOR TABLE #
catExRet_df = pd.merge(left=catExRet_df, right=factor_df, 
                       how='left', on=['MONTH'])


        ##################
        # 9-FACTOR MODEL #
        ##################


# DEFINE X #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    vwCatOut_df.loc[f'VW {cat} Est', ['HF9 Alpha', 'HF9 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    vwCatOut_df.loc[f'VW {cat} T-stat', 'HF9 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
vwCatOut_df.loc['Average |Alpha|', 'HF9 Alpha'] = np.abs(alpha_df['Alpha']).mean()
vwCatOut_df.loc['Average Adj-R2', 'HF9 Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
vwCatOut_df.loc['F-GRS', 'HF9 Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        ##############
        # CAPM MODEL #
        ##############


# DEFINE X #
XNAMELIST = ['MKT']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    vwCatOut_df.loc[f'VW {cat} Est', ['CAPM Alpha', 'CAPM Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    vwCatOut_df.loc[f'VW {cat} T-stat', 'CAPM Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
vwCatOut_df.loc['Average |Alpha|', 'CAPM Alpha'] = np.abs(alpha_df['Alpha']).mean()
vwCatOut_df.loc['Average Adj-R2', 'CAPM Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
vwCatOut_df.loc['F-GRS', 'CAPM Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        ##############################
        # FANA FRENCH 5-FACTOR MODEL #
        ##############################


# DEFINE X #
XNAMELIST = ['FF5F_MKT', 'FF5F_SMB', 'FF5F_HML', 'FF5F_RMW', 'FF5F_CMA']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    vwCatOut_df.loc[f'VW {cat} Est', ['FF5 Alpha', 'FF5 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    vwCatOut_df.loc[f'VW {cat} T-stat', 'FF5 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
vwCatOut_df.loc['Average |Alpha|', 'FF5 Alpha'] = np.abs(alpha_df['Alpha']).mean()
vwCatOut_df.loc['Average Adj-R2', 'FF5 Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
vwCatOut_df.loc['F-GRS', 'FF5 Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        #############################
        # FUNG-HSIEH 7-FACTOR MODEL #
        #############################


# DEFINE X #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    vwCatOut_df.loc[f'VW {cat} Est', ['FH7 Alpha', 'FH7 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    vwCatOut_df.loc[f'VW {cat} T-stat', 'FH7 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
vwCatOut_df.loc['Average |Alpha|', 'FH7 Alpha'] = np.abs(alpha_df['Alpha']).mean()
vwCatOut_df.loc['Average Adj-R2', 'FH7 Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
vwCatOut_df.loc['F-GRS', 'FH7 Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        ###############################
        # HOUSE CLEAN FOR VW CATEGORY #
        ###############################
    

# HOUSE CLEAN #
del catExRet_df, catList


    ##################
    # OUTPUT RESULTS #
    ##################


# OUTPUT RESULT TABLE #
out_df = pd.concat([indOut_df, ewCatOut_df, vwCatOut_df])
out_df.to_excel(tabDir + '\\Table5.xlsx')
del indOut_df, ewCatOut_df, vwCatOut_df, factor_df, hfExRet_df, out_df






############################
#         TABLE 6          # 
############################


# IMPORT FACTORS # 
factor_df = pd.read_excel('MONTH_FACTOR.xlsx')
factor_df.drop(columns=['RF'], inplace=True)


# IMPORT FAMA-FRENCH FIVE FACTORS #
ff5factor_df = pd.read_excel('MONTH_FF5F.xlsx')
factor_df = pd.merge(left=factor_df, right=ff5factor_df, how='left', on=['MONTH'])
del ff5factor_df


# HEDGE FUND EXCESS RETURNS #
#hfExRet_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfExRet_df = pd.read_pickle('PSEUDODATA1.pkl')
hfExRet_df = hfExRet_df.loc[:,['FUNDID','MONTH','EXRET', 'CAT']].copy()
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 713,737 fund-month observations; 7,314 unique funds


# GET STRATEGY CATEGORY MAPPING #
fundCat_df = hfExRet_df.loc[:,['FUNDID', 'CAT']].drop_duplicates(subset=['FUNDID'], keep='last')
fundCat_df = fundCat_df.set_index(keys=['FUNDID'])


    ############################################
    # HEDGE FUND ALPHA DISTRIBUTION ESTIMATION #
    ############################################


# RECEIVING DATA FRAME FOR ALPHAS #
fundList = list(hfExRet_df['FUNDID'].unique())
alpha_df = pd.DataFrame(columns=['HF9', 'CAPM', 'FF5', 'FH7'], index=fundList)
tStat_df = pd.DataFrame(columns=['HF9', 'CAPM', 'FF5', 'FH7'], index=fundList)


# LOOP THROUGH FUNDS #
for fund in fundList:
    
    
    # DISPLAY STATUS TO USER #
    print(f'Hedge Fund: {fund}.')
    
    
    # TIME-SERIES REGRESSION DATA #
    regData = pd.merge(left=hfExRet_df.loc[hfExRet_df['FUNDID']==fund, ['MONTH', 'EXRET']], 
                       right=factor_df, how='left', on='MONTH')
    
    
        #######
        # HF9 #
        #######
    
    
    # DEFINE INDEPENDENT VARIABLE #
    xNameList = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']
    
    
    # TIME-SERIES REGRESSION #
    factorModel = sm.OLS(regData['EXRET'], sm.add_constant(regData.loc[:, xNameList])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    #print(factorModel.summary())
    alpha_df.loc[fund, 'HF9'] = factorModel.params[0]
    tStat_df.loc[fund, 'HF9'] = factorModel.tvalues[0]
    
    
    # HOUSE CLEAN #
    del xNameList, factorModel
    
    
        ########
        # CAPM #
        ########
    
    
    # DEFINE INDEPENDENT VARIABLE #
    xNameList = ['MKT']
    
    
    # TIME-SERIES REGRESSION #
    factorModel = sm.OLS(regData['EXRET'], sm.add_constant(regData.loc[:, xNameList])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    #print(factorModel.summary())
    alpha_df.loc[fund, 'CAPM'] = factorModel.params[0]
    tStat_df.loc[fund, 'CAPM'] = factorModel.tvalues[0]
    
    
    # HOUSE CLEAN #
    del xNameList, factorModel
    
    
        #######
        # FF5 #
        #######
    
    
    # DEFINE INDEPENDENT VARIABLE #
    xNameList = ['FF5F_MKT', 'FF5F_SMB', 'FF5F_HML', 'FF5F_RMW', 'FF5F_CMA']
    
    
    # TIME-SERIES REGRESSION #
    factorModel = sm.OLS(regData['EXRET'], sm.add_constant(regData.loc[:, xNameList])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    #print(factorModel.summary())
    alpha_df.loc[fund, 'FF5'] = factorModel.params[0]
    tStat_df.loc[fund, 'FF5'] = factorModel.tvalues[0]
    
    
    # HOUSE CLEAN #
    del xNameList, factorModel
    
    
        #######
        # FH7 #
        #######
    
    
    # DEFINE INDEPENDENT VARIABLE #
    xNameList = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT']
    
    
    # TIME-SERIES REGRESSION #
    factorModel = sm.OLS(regData['EXRET'], sm.add_constant(regData.loc[:, xNameList])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    #print(factorModel.summary())
    alpha_df.loc[fund, 'FH7'] = factorModel.params[0]
    tStat_df.loc[fund, 'FH7'] = factorModel.tvalues[0]
    
    
    # HOUSE CLEAN #
    del xNameList, factorModel
    
    
    # HOUSE CLEAN #
    del fund, regData


# ADD STRATEGY CATEGORY TO TABLE #
alpha_df = pd.merge(left=alpha_df, right=fundCat_df, how='left', 
                    left_index=True, right_index=True)
tStat_df = pd.merge(left=tStat_df, right=fundCat_df, how='left', 
                    left_index=True, right_index=True)


# DATA TYPES #
alpha_df['HF9'] = alpha_df['HF9'].astype(float)
alpha_df['CAPM'] = alpha_df['CAPM'].astype(float)
alpha_df['FF5'] = alpha_df['FF5'].astype(float)
alpha_df['FH7'] = alpha_df['FH7'].astype(float)

tStat_df['HF9'] = tStat_df['HF9'].astype(float)
tStat_df['CAPM'] = tStat_df['CAPM'].astype(float)
tStat_df['FF5'] = tStat_df['FF5'].astype(float)
tStat_df['FH7'] = tStat_df['FH7'].astype(float)


    ##########################################
    # FRACTION OF T-STATISTICS OVER 2 and -2 #
    ##########################################


# CATEGROY LIST #
catList = list(fundCat_df['CAT'].unique())
catList.sort()


# RECEIVING DATA FRAME #
frac_df = pd.DataFrame(columns=['HF9<-2', 'HF9>2', 'CAPM<-2', 'CAPM>2', 
                                'FF5<-2', 'FF5>2', 'FH7<-2', 'FH7>2'], 
                       index=['All Funds']+catList)


# FRACTION FOR ALL FUNDS #
for mod in ['HF9', 'CAPM', 'FF5', 'FH7']:
    
    
    # ALL FUNDS #
    numFunds = tStat_df.shape[0]
    frac_df.loc['All Funds', f'{mod}<-2'] = (tStat_df[mod] < -2 ).sum() / numFunds
    frac_df.loc['All Funds', f'{mod}>2'] = (tStat_df[mod] > 2 ).sum() / numFunds
    del numFunds
    
    
    # CATEGORIES #
    for cat in catList:
        
        
        # EXTRACT NEEDED ROWS #
        tStat_cat = tStat_df.loc[tStat_df['CAT']==cat, :].copy()
        
        
        # FRACTIONS #
        numFunds = tStat_cat.shape[0]
        frac_df.loc[f'{cat}', f'{mod}<-2'] = (tStat_cat[mod] < -2 ).sum() / numFunds
        frac_df.loc[f'{cat}', f'{mod}>2'] = (tStat_cat[mod] > 2 ).sum() / numFunds
        del cat, tStat_cat, numFunds
        
    
    # HOUSE CLEAN #
    del mod


# OUTPUT RESULTS IN PERCENTAGE #
frac_df *= 100
frac_df.to_csv(tabDir + '\\Table6.csv')


# HOUSE CLEAN #
del catList






############################
#         FIGURE 2         # 
############################


# IMPORT PLOT PACKAGE #
font = {'size'   : 16}

matplotlib.rc('font', **font)

fig, ax = plt.subplots(figsize=(12, 8))
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

lss = ['solid', 'dashed', 'dashdot', 'dotted']
for model, ls  in zip(['HF9','CAPM','FF5','FH7'], lss):
    sns.kdeplot(data=alpha_df[model], gridsize=1000, linestyle=ls, 
                linewidth=2, label=model)
    del model, ls
plt.legend(loc="upper left")

plt.xlim(left=-2, right=2)
plt.xlabel(r'$\alpha$ (%)')
plt.ylabel('')

plt.axhline(y=0.2, color='grey', linewidth=0.2)
plt.axhline(y=0.4, color='grey', linewidth=0.2)
plt.axhline(y=0.6, color='grey', linewidth=0.2)
plt.axhline(y=0.8, color='grey', linewidth=0.2)
plt.axhline(y=1, color='grey', linewidth=0.2)

plt.axvline(x=-1.5, color='grey', linewidth=0.2)
plt.axvline(x=-1, color='grey', linewidth=0.2)
plt.axvline(x=-0.5, color='grey', linewidth=0.2)
plt.axvline(x=0.5, color='grey', linewidth=0.2)
plt.axvline(x=1, color='grey', linewidth=0.2)
plt.axvline(x=1.5, color='grey', linewidth=0.2)

plt.axvline(x=0, color='black', linewidth=1)

plt.savefig(figDir+'\\Figure2.png')
plt.close()


# HOUSE CLEAN #
del alpha_df, ax, factor_df, fig, font, frac_df, fundCat_df, fundList, hfExRet_df, lss, tStat_df 






############################
#         TABLE 7          # 
############################


# IMPORT FACTORS # 
factor_df = pd.read_excel('MONTH_FACTOR.xlsx')


# IMPORT FAMA-FRENCH FIVE FACTORS #
ff5factor_df = pd.read_excel('MONTH_FF5F.xlsx')
factor_df = pd.merge(left=factor_df, right=ff5factor_df, how='left', on=['MONTH'])
del ff5factor_df


# HEDGE FUND INDEX RETURNS #
#hfRet_df = pd.read_excel('BARCLAYHEDGE_INDEXID_INDEXNAME_RETURN_1997_2022_INDEX_SAMPLE.xlsx')
hfRet_df = pd.read_excel('PSEUDODATA2.xlsx')
hfRet_df.set_index(keys=['INDEXID', 'INDEXNAME'], inplace=True)
monthList = []
for year in range(1997, 2023):
    for month in range(1, 13):
        yyyymm = year * 100 + month
        if yyyymm <= 202203:
            monthList.append(yyyymm)
        del month, yyyymm
    del year
hfRet_df.columns = monthList
del monthList
hfRet_df = hfRet_df.stack()
hfRet_df = hfRet_df.reset_index()
hfRet_df.columns = ['INDEXID', 'INDEXNAME', 'MONTH', 'RET']
hfRet_df = hfRet_df.loc[hfRet_df['MONTH']<=201908, :].copy()
# 2,720 index-month observations; 10 unique indeces


# HEDGE FUND INDEX EXCESS RETURN #
hfExRet_df = pd.merge(left=hfRet_df, right=factor_df.loc[:,['MONTH', 'RF']], 
                      how='left', on='MONTH')
hfExRet_df['EXRET'] = hfExRet_df['RET'].copy() - hfExRet_df['RF'].copy()
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 3,000 index-month observations; 10 unique indeces
hfExRet_df.drop(columns=['RET','RF'], inplace=True)
del hfRet_df


# DEFINE NUMBER OF INDECES #
NUMPORT = 10
hfExRet_df['INDEXNAME'].replace({'Barclay Hedge Fund Index': 'IND',
                                 'Barclay Convertible Arbitrage Index': 'CA',
                                 'Barclay Event Driven Index': 'ED',
                                 'Barclay Emerging Markets Index': 'EM',
                                 'Barclay Equity Market Neutral Index': 'EMN',
                                 'Barclay Fixed Income Arbitrage Index': 'FIA',
                                 'Barclay Fund of Funds Index': 'FOF',
                                 'Barclay Global Macro Index': 'GM',
                                 'Barclay Equity Long/Short Index': 'LSEH',
                                 'Barclay Multi Strategy Index': 'MS'}, 
                                inplace=True)
indexList = ['IND', 'CA', 'ED', 'EM', 'EMN', 'FIA', 'FOF', 'GM', 'LSEH', 'MS']


    #######################
    # BARCLAY INDEX TESTS #
    #######################


# RECEIVING DATA FRAMES #
idxOut_df = pd.DataFrame(columns=['HF9 Alpha', 'HF9 Adj-R2',
                                    'CAPM Alpha', 'CAPM Adj-R2',
                                    'FF5 Alpha', 'FF5 Adj-R2',
                                    'FH7 Alpha', 'FH7 Adj-R2'], 
                         index= ['IND Est', 'IND T-stat', 
                                 'CA Est', 'CA T-stat', 
                                 'ED Est', 'ED T-stat', 
                                 'EM Est', 'EM T-stat',
                                 'EMN Est', 'EMN T-stat', 
                                 'FIA Est', 'FIA T-stat',
                                 'FOF Est', 'FOF T-stat', 
                                 'GM Est', 'GM T-stat',
                                 'LSEH Est', 'LSEH T-stat', 
                                 'MS Est', 'MS T-stat',
                                 'Average |Alpha|', 'Average Adj-R2', 'F-GRS'])


# PIVOT TABLE #
hfExRet_df = pd.pivot_table(hfExRet_df, values='EXRET', 
                            index='MONTH', columns='INDEXNAME')
hfExRet_df = hfExRet_df.reset_index()

# MERGE WITH FACTOR TABLE #
hfExRet_df = pd.merge(left=hfExRet_df, right=factor_df, 
                      how='left', on=['MONTH'])


        ##################
        # 9-FACTOR MODEL #
        ##################


# DEFINE X #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=indexList)
epsilon_df = pd.DataFrame(columns=indexList, index=hfExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for index in indexList:
    
    
    # DEFINE DEPENDENT VARIABLE #
    YNAME = index
    
    # TIME-SERIES REGRESSION #
    reg_df = hfExRet_df.loc[:, ['MONTH', index]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    idxOut_df.loc[f'{index} Est', ['HF9 Alpha', 'HF9 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    idxOut_df.loc[f'{index} T-stat', 'HF9 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[index, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[index] = reg_df['resid'].copy()
    del index, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
idxOut_df.loc['Average |Alpha|', 'HF9 Alpha'] = np.abs(alpha_df.loc[indexList[1:],'Alpha']).mean()
idxOut_df.loc['Average Adj-R2', 'HF9 Adj-R2'] = alpha_df.loc[indexList[1:],'Adj-R2'].mean()


# GRS TEST #
alpha_df = alpha_df.loc[alpha_df.index!='IND', 'Alpha'].copy()
epsilon_df.drop(columns=['IND'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
idxOut_df.loc['F-GRS', 'HF9 Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        ##############
        # CAPM MODEL #
        ##############


# DEFINE X #
XNAMELIST = ['MKT']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=indexList)
epsilon_df = pd.DataFrame(columns=indexList, index=hfExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for index in indexList:
    
    
    # DEFINE DEPENDENT VARIABLE #
    YNAME = index
    
    # TIME-SERIES REGRESSION #
    reg_df = hfExRet_df.loc[:, ['MONTH', index]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    idxOut_df.loc[f'{index} Est', ['CAPM Alpha', 'CAPM Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    idxOut_df.loc[f'{index} T-stat', 'CAPM Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[index, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[index] = reg_df['resid'].copy()
    del index, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
idxOut_df.loc['Average |Alpha|', 'CAPM Alpha'] = np.abs(alpha_df.loc[indexList[1:],'Alpha']).mean()
idxOut_df.loc['Average Adj-R2', 'CAPM Adj-R2'] = alpha_df.loc[indexList[1:],'Adj-R2'].mean()


# GRS TEST #
alpha_df = alpha_df.loc[alpha_df.index!='IND', 'Alpha'].copy()
epsilon_df.drop(columns=['IND'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
idxOut_df.loc['F-GRS', 'CAPM Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        ##############################
        # FANA FRENCH 5-FACTOR MODEL #
        ##############################


# DEFINE X #
XNAMELIST = ['FF5F_MKT', 'FF5F_SMB', 'FF5F_HML', 'FF5F_RMW', 'FF5F_CMA']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=indexList)
epsilon_df = pd.DataFrame(columns=indexList, index=hfExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for index in indexList:
    
    
    # DEFINE DEPENDENT VARIABLE #
    YNAME = index
    
    # TIME-SERIES REGRESSION #
    reg_df = hfExRet_df.loc[:, ['MONTH', index]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    idxOut_df.loc[f'{index} Est', ['FF5 Alpha', 'FF5 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    idxOut_df.loc[f'{index} T-stat', 'FF5 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[index, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[index] = reg_df['resid'].copy()
    del index, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
idxOut_df.loc['Average |Alpha|', 'FF5 Alpha'] = np.abs(alpha_df.loc[indexList[1:],'Alpha']).mean()
idxOut_df.loc['Average Adj-R2', 'FF5 Adj-R2'] = alpha_df.loc[indexList[1:],'Adj-R2'].mean()


# GRS TEST #
alpha_df = alpha_df.loc[alpha_df.index!='IND', 'Alpha'].copy()
epsilon_df.drop(columns=['IND'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
idxOut_df.loc['F-GRS', 'FF5 Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        #############################
        # FUNG-HSIEH 7-FACTOR MODEL #
        #############################


# DEFINE X #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=indexList)
epsilon_df = pd.DataFrame(columns=indexList, index=hfExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for index in indexList:
    
    
    # DEFINE DEPENDENT VARIABLE #
    YNAME = index
    
    # TIME-SERIES REGRESSION #
    reg_df = hfExRet_df.loc[:, ['MONTH', index]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    idxOut_df.loc[f'{index} Est', ['FH7 Alpha', 'FH7 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    idxOut_df.loc[f'{index} T-stat', 'FH7 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[index, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[index] = reg_df['resid'].copy()
    del index, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
idxOut_df.loc['Average |Alpha|', 'FH7 Alpha'] = np.abs(alpha_df.loc[indexList[1:],'Alpha']).mean()
idxOut_df.loc['Average Adj-R2', 'FH7 Adj-R2'] = alpha_df.loc[indexList[1:],'Adj-R2'].mean()


# GRS TEST #
alpha_df = alpha_df.loc[alpha_df.index!='IND', 'Alpha'].copy()
epsilon_df.drop(columns=['IND'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
idxOut_df.loc['F-GRS', 'FH7 Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


# OUTPUT RESULT TABLE #
idxOut_df.to_excel(tabDir + '\\Table7_PanelA.xlsx')
    

# HOUSE CLEAN #
del hfExRet_df, indexList, idxOut_df, NUMPORT


    ############################################
    # BARCLAY INDIVIDUAL FUND DATA PREPARATION #
    ############################################


# IMPORT FUND NAME TABLE #
#fundList = pd.read_excel('BARCLAYHEDGE_FUNDID_CATEGORY_INCEPTIONDATE_SAMPLE_USD_HF_FOF_NET_CAT.xlsx')
fundList = pd.read_excel('PSEUDODATA3.xlsx')
fundList.columns = ['FUNDID', 'CAT', 'INCEPT']
fundList = fundList.loc[:,['FUNDID', 'CAT']].copy()
catDict = {"Convertible Arbitrage": "CA", 
           "Convertible Arbitrage - Credit": "CA", 
           "Convertible Arbitrage - Volatility": "CA",
           "Equity Dedicated Short": "DSB", 
           "Equity Short-Bias": "DSB", 
           "Equity Short-Bias - Growth Oriented": "DSB", 
           "Equity Short-Bias - Opportunistic": "DSB", 
           "Equity Short-Bias - Quantitative": "DSB",
           "Event Driven": "ED", 
           "Emerging Markets - Asia": "EM", 
           "Emerging Markets - Eastern Europe/CIS": "EM", 
           "Emerging Markets - Global": "EM", 
           "Emerging Markets - Latin America": "EM", 
           "Emerging Markets - MENA": "EM", 
           "Emerging Markets - Sub Saharan": "EM", 
           "Equity Market Neutral": "EMN", 
           "Equity Market Neutral - Quantitative": "EMN", 
           "Equity Market Neutral - Value Oriented": "EMN", 
           "Fixed Income - Arbitrage": "FIA", 
           "Fixed Income - Arbitrage - Capital Structure Arb": "FIA", 
           "Fixed Income - Arbitrage - Credit Default Swaps": "FIA", 
           "Fund of Funds": "FOF",
           "Fund of Funds - Arbitrage": "FOF", 
           "Fund of Funds - Diversified": "FOF", 
           "Macro": "GM", 
           "Macro - Discretionary": "GM", 
           "Macro - Quantitative": "GM", 
           "Equity Long/Short": "LSEH", 
           "Equity Long/Short - Growth Oriented": "LSEH", 
           "Equity Long/Short - Opportunistic": "LSEH", 
           "Equity Long/Short - Quantitative": "LSEH", 
           "Equity Long/Short - Trading Oriented": "LSEH", 
           "Equity Long/Short - Value Oriented": "LSEH", 
           "Multi-Strategy": "MS", 
           "Multi-Strategy -Strategy Arb - Quantitative": "MS"}
fundList.replace({'CAT': catDict}, inplace=True)
# 7,353 funds 


# IMPORT RETURNS #
#hfRawData = pd.read_excel('BARCLAYHEDGE_FUNDID_RETURN_1994_2022_FOF_HF_SAMPLE.xlsx')
hfRawData = pd.read_excel('PSEUDODATA4.xlsx')
monthList = list()
for year in range(1994, 2023):
    for month in range(1, 13):
        yyyymm = year * 100 + month
        if yyyymm <= 202203:
            monthList.append(yyyymm)
        del month, yyyymm
    del year
hfRawData.columns = ["FUNDID"] + monthList
del monthList
# 18,340 funds


# ONLY KEEP SELECTED FUNDS #
hfRawData = hfRawData.loc[hfRawData['FUNDID'].isin(fundList['FUNDID']), :].copy()
# 7,353 funds 


# STARTING FROM 199701 TO 201908 #
hfRawData.set_index(keys='FUNDID', inplace=True)
hfRawData = hfRawData.loc[:, 199701:201908].copy()


# RETURN OUTLIERS #
hfRawData[hfRawData>300] = np.NaN 
hfRawData[hfRawData<-90] = np.NaN 


# STACK MATRIX #
hfRawData = hfRawData.stack().reset_index()
hfRawData.columns = ['FUNDID', 'MONTH', 'RET']
# 543,564 fund-month observations; 7,319 unique funds


# HEDGE FUND EXCESS RETURNS #
hfRawData = pd.merge(left=hfRawData, right=factor_df.loc[:,['MONTH', 'RF']], how='left', on=['MONTH'])
hfRawData['EXRET'] =  hfRawData['RET'].copy() - hfRawData['RF'].copy()
hfRawData.drop(columns=['RET', 'RF'], inplace=True)


# REQUIRE AT LEAST 48 MONTHS OF RETURNS #
hfRawData.sort_values(by=['FUNDID', 'MONTH'], inplace=True)
monthCount = hfRawData['FUNDID'].value_counts()
monthCount = list(monthCount[monthCount>=48].copy().index)
monthCount.sort()
hfRawData = hfRawData.loc[hfRawData['FUNDID'].isin(monthCount), :].copy()
# 467,790 fund-month observations; 4,569 unique funds
del monthCount


# ADD CAT TO RETERN TABLE #
hfExRet_df = pd.merge(left=hfRawData, right=fundList, how='left', on='FUNDID')
del hfRawData, catDict, fundList
hfExRet_df.drop_duplicates(subset=['FUNDID'])['CAT'].value_counts()


# GET STRATEGY CATEGORY MAPPING #
fundCat_df = hfExRet_df.loc[:,['FUNDID', 'CAT']].drop_duplicates(subset=['FUNDID'], keep='last')
fundCat_df = fundCat_df.set_index(keys=['FUNDID'])


    ############################################
    # HEDGE FUND ALPHA DISTRIBUTION ESTIMATION #
    ############################################


# RECEIVING DATA FRAME FOR ALPHAS #
fundList = list(hfExRet_df['FUNDID'].unique())
alpha_df = pd.DataFrame(columns=['HF9', 'CAPM', 'FF5', 'FH7'], index=fundList)
tStat_df = pd.DataFrame(columns=['HF9', 'CAPM', 'FF5', 'FH7'], index=fundList)


# LOOP THROUGH FUNDS #
for fund in fundList:
    
    
    # DISPLAY STATUS TO USER #
    print(f'Hedge Fund: {fund}.')
    
    
    # TIME-SERIES REGRESSION DATA #
    regData = pd.merge(left=hfExRet_df.loc[hfExRet_df['FUNDID']==fund, ['MONTH', 'EXRET']], 
                       right=factor_df, how='left', on='MONTH')
    
    
        #######
        # HF9 #
        #######
    
    
    # DEFINE INDEPENDENT VARIABLE #
    xNameList = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']
    
    
    # TIME-SERIES REGRESSION #
    factorModel = sm.OLS(regData['EXRET'], sm.add_constant(regData.loc[:, xNameList])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    alpha_df.loc[fund, 'HF9'] = factorModel.params[0]
    tStat_df.loc[fund, 'HF9'] = factorModel.tvalues[0]
    
    
    # HOUSE CLEAN #
    del xNameList, factorModel
    
    
        ########
        # CAPM #
        ########
    
    
    # DEFINE INDEPENDENT VARIABLE #
    xNameList = ['MKT']
    
    
    # TIME-SERIES REGRESSION #
    factorModel = sm.OLS(regData['EXRET'], sm.add_constant(regData.loc[:, xNameList])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    alpha_df.loc[fund, 'CAPM'] = factorModel.params[0]
    tStat_df.loc[fund, 'CAPM'] = factorModel.tvalues[0]
    
    
    # HOUSE CLEAN #
    del xNameList, factorModel
    
    
        #######
        # FF5 #
        #######
    
    
    # DEFINE INDEPENDENT VARIABLE #
    xNameList = ['FF5F_MKT', 'FF5F_SMB', 'FF5F_HML', 'FF5F_RMW', 'FF5F_CMA']
    
    
    # TIME-SERIES REGRESSION #
    factorModel = sm.OLS(regData['EXRET'], sm.add_constant(regData.loc[:, xNameList])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    alpha_df.loc[fund, 'FF5'] = factorModel.params[0]
    tStat_df.loc[fund, 'FF5'] = factorModel.tvalues[0]
    
    
    # HOUSE CLEAN #
    del xNameList, factorModel
    
    
        #######
        # FH7 #
        #######
    
    
    # DEFINE INDEPENDENT VARIABLE #
    xNameList = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT']
    
    
    # TIME-SERIES REGRESSION #
    factorModel = sm.OLS(regData['EXRET'], sm.add_constant(regData.loc[:, xNameList])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    alpha_df.loc[fund, 'FH7'] = factorModel.params[0]
    tStat_df.loc[fund, 'FH7'] = factorModel.tvalues[0]
    
    
    # HOUSE CLEAN #
    del xNameList, factorModel
    
    
    # HOUSE CLEAN #
    del fund, regData


# ADD STRATEGY CATEGORY TO TABLE #
alpha_df = pd.merge(left=alpha_df, right=fundCat_df, how='left', 
                    left_index=True, right_index=True)
tStat_df = pd.merge(left=tStat_df, right=fundCat_df, how='left', 
                    left_index=True, right_index=True)


# DATA TYPES #
alpha_df['HF9'] = alpha_df['HF9'].astype(float)
alpha_df['CAPM'] = alpha_df['CAPM'].astype(float)
alpha_df['FF5'] = alpha_df['FF5'].astype(float)
alpha_df['FH7'] = alpha_df['FH7'].astype(float)

tStat_df['HF9'] = tStat_df['HF9'].astype(float)
tStat_df['CAPM'] = tStat_df['CAPM'].astype(float)
tStat_df['FF5'] = tStat_df['FF5'].astype(float)
tStat_df['FH7'] = tStat_df['FH7'].astype(float)


    ##########################################
    # FRACTION OF T-STATISTICS OVER 2 and -2 #
    ##########################################


# CATEGROY LIST #
catList = list(fundCat_df['CAT'].unique())
catList.sort()


# RECEIVING DATA FRAME #
frac_df = pd.DataFrame(columns=['HF9<-2', 'HF9>2', 'CAPM<-2', 'CAPM>2', 
                                'FF5<-2', 'FF5>2', 'FH7<-2', 'FH7>2'], 
                       index=['All Funds']+catList)


# FRACTION FOR ALL FUNDS #
for mod in ['HF9', 'CAPM', 'FF5', 'FH7']:
    
    
    # ALL FUNDS #
    numFunds = tStat_df.shape[0]
    frac_df.loc['All Funds', f'{mod}<-2'] = (tStat_df[mod] < -2 ).sum() / numFunds
    frac_df.loc['All Funds', f'{mod}>2'] = (tStat_df[mod] > 2 ).sum() / numFunds
    del numFunds
    
    
    # CATEGORIES #
    for cat in catList:
        
        
        # EXTRACT NEEDED ROWS #
        tStat_cat = tStat_df.loc[tStat_df['CAT']==cat, :].copy()
        
        
        # FRACTIONS #
        numFunds = tStat_cat.shape[0]
        frac_df.loc[f'{cat}', f'{mod}<-2'] = (tStat_cat[mod] < -2 ).sum() / numFunds
        frac_df.loc[f'{cat}', f'{mod}>2'] = (tStat_cat[mod] > 2 ).sum() / numFunds
        del cat, tStat_cat, numFunds
        
    
    # HOUSE CLEAN #
    del mod


# OUTPUT RESULTS IN PERCENTAGE #
frac_df *= 100
frac_df.to_csv(tabDir + '\\Table7_PanelB.csv')


# HOUSE CLEAN #
del catList






############################
#         FIGURE 3         # 
############################


# DENSITY PLOT #
font = {'size'   : 16}

matplotlib.rc('font', **font)

fig, ax = plt.subplots(figsize=(12, 8))
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

lss = ['solid', 'dashed', 'dashdot', 'dotted']
for model, ls  in zip(['HF9','CAPM','FF5','FH7'], lss):
    sns.kdeplot(data=alpha_df[model], gridsize=1000, linestyle=ls, 
                linewidth=2, label=model)
    del model, ls
plt.legend(loc="upper left")

plt.xlim(left=-2, right=2)
plt.xlabel(r'$\alpha$ (%)')
plt.ylabel('')

plt.axhline(y=0.2, color='grey', linewidth=0.2)
plt.axhline(y=0.4, color='grey', linewidth=0.2)
plt.axhline(y=0.6, color='grey', linewidth=0.2)
plt.axhline(y=0.8, color='grey', linewidth=0.2)
plt.axhline(y=1, color='grey', linewidth=0.2)

plt.axvline(x=-1.5, color='grey', linewidth=0.2)
plt.axvline(x=-1, color='grey', linewidth=0.2)
plt.axvline(x=-0.5, color='grey', linewidth=0.2)
plt.axvline(x=0.5, color='grey', linewidth=0.2)
plt.axvline(x=1, color='grey', linewidth=0.2)
plt.axvline(x=1.5, color='grey', linewidth=0.2)

plt.axvline(x=0, color='black', linewidth=1)

plt.savefig(figDir+'\\Figure3.png')
plt.close()


# HOUSE CLEAN #
del alpha_df, ax, factor_df, fig, font, frac_df, fundCat_df, fundList, hfExRet_df, lss, tStat_df 






############################
#         TABLE 8          # 
############################


# IMPORT FUNCTION #
from sklearn.metrics import mean_squared_error


# PRE-SPECIFY ESTIMATION WINDOW SIZE #
NUMYEAR_TRAIN = 5


# RECEIVING DATA FRAME #
mse_df = pd.DataFrame(columns=['HF9', 'CAPM', 'FF5', 'FH7'], 
                      index=['TASS IND', 'Barclay IND', 
                             'TASS CA', 'TASS ED', 'TASS EM', 'TASS EMN', 
                             'TASS FIA', 'TASS FOF', 'TASS GM', 'TASS LSEH', 
                             'TASS MS', 
                             'Barclay CA', 'Barclay ED', 'Barclay EM', 'Barclay EMN', 
                             'Barclay FIA', 'Barclay FOF', 'Barclay GM', 'Barclay LSEH', 
                             'Barclay MS', ])


    ###########################
    # TASS OUT-OF-SAMPLE TEST #
    ###########################


# IMPORT FACTORS # 
factor_df = pd.read_excel('MONTH_FACTOR.xlsx')


# IMPORT FAMA-FRENCH FIVE FACTORS #
ff5factor_df = pd.read_excel('MONTH_FF5F.xlsx')
factor_df = pd.merge(left=factor_df, right=ff5factor_df, how='left', on=['MONTH'])
del ff5factor_df


# OUT-OF-SAMPLE FACTORS #
oosFactor_df = pd.read_excel('MONTH_FACTOR_201909_202112.xlsx')


# COMBINE FACTORS #
factor_df = factor_df.loc[:, oosFactor_df.columns].copy()
factor_df = pd.concat([factor_df, oosFactor_df], axis=0, ignore_index=True)
del oosFactor_df
factor_df.drop_duplicates(subset=['MONTH'], inplace=True, ignore_index=True)


# HEDGE FUND INDEX EXCESS RETURNS #
#hfExRet_df = pd.read_csv('TASS_PORTFOLIO_EXRET_1997_2021_OUT_OF_SAMPLE.csv')
hfExRet_df = pd.read_csv('PSEUDODATA5.csv')


# END WITH 202008 #
hfExRet_df = hfExRet_df.loc[hfExRet_df['MONTH']<=202012, :].copy()


# DEFINE NUMBER OF INDECES #
indexList = hfExRet_df.columns[1:]


# MERGE WITH FACTOR TABLE #
hfExRet_df = pd.merge(left=hfExRet_df, right=factor_df, 
                      how='left', on=['MONTH'])
del factor_df


        ##################
        # 9-FACTOR MODEL #
        ##################


# DEFINE X #
MOD = 'HF9'
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']


# LOOP THROUGH CATEGORY #
for index in indexList:
    
    
    # DEFINE DEPENDENT VARIABLE #
    YNAME = index
    
    
    # RECEIVING DATA FRAME FOR PREDICTED VALUES #
    pred_df = hfExRet_df.loc[hfExRet_df['MONTH']>=201909,['MONTH', YNAME]].copy()
    pred_df.columns = ['MONTH','y_true']
    
    
    # TIME-SERIES REGRESSION #
    trainSet = hfExRet_df.loc[(hfExRet_df['MONTH']>=(2019-NUMYEAR_TRAIN)*100+9)&(hfExRet_df['MONTH']<=201908), 
                              ['MONTH', index]+XNAMELIST].copy()
    trainSet.dropna(inplace=True)
    factorModel = sm.OLS(trainSet[YNAME], sm.add_constant(trainSet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    del trainSet
    
    # PREDICTION #
    pred_df['y_pred'] = factorModel.predict(sm.add_constant(hfExRet_df.loc[hfExRet_df['MONTH']>=201909,XNAMELIST]))
    del factorModel
    
    
    # MSE #
    mse_df.loc[f'TASS {index}', MOD] = mean_squared_error(pred_df['y_true'], pred_df['y_pred'])
    
    
    # HOUSE CLEAN #
    del index, YNAME, pred_df 


# HOUSE CLEAN #
del MOD, XNAMELIST


        ##############
        # CAPM MODEL #
        ##############


# DEFINE X #
MOD = 'CAPM'
XNAMELIST = ['MKT']


# LOOP THROUGH CATEGORY #
for index in indexList:
    
    
    # DEFINE DEPENDENT VARIABLE #
    YNAME = index
    
    
    # RECEIVING DATA FRAME FOR PREDICTED VALUES #
    pred_df = hfExRet_df.loc[hfExRet_df['MONTH']>=201909,['MONTH', YNAME]].copy()
    pred_df.columns = ['MONTH','y_true']
    
    
    # TIME-SERIES REGRESSION #
    trainSet = hfExRet_df.loc[(hfExRet_df['MONTH']>=(2019-NUMYEAR_TRAIN)*100+9)&(hfExRet_df['MONTH']<=201908), 
                              ['MONTH', index]+XNAMELIST].copy()
    trainSet.dropna(inplace=True)
    factorModel = sm.OLS(trainSet[YNAME], sm.add_constant(trainSet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    del trainSet
    
    # PREDICTION #
    pred_df['y_pred'] = factorModel.predict(sm.add_constant(hfExRet_df.loc[hfExRet_df['MONTH']>=201909,XNAMELIST]))
    del factorModel
    
    
    # MSE #
    mse_df.loc[f'TASS {index}', MOD] = mean_squared_error(pred_df['y_true'], pred_df['y_pred'])
    
    
    # HOUSE CLEAN #
    del index, YNAME, pred_df 


# HOUSE CLEAN #
del MOD, XNAMELIST


        ##############################
        # FANA FRENCH 5-FACTOR MODEL #
        ##############################


# DEFINE X #
MOD = 'FF5'
XNAMELIST = ['FF5F_MKT', 'FF5F_SMB', 'FF5F_HML', 'FF5F_RMW', 'FF5F_CMA']


# LOOP THROUGH CATEGORY #
for index in indexList:
    
    
    # DEFINE DEPENDENT VARIABLE #
    YNAME = index
    
    
    # RECEIVING DATA FRAME FOR PREDICTED VALUES #
    pred_df = hfExRet_df.loc[hfExRet_df['MONTH']>=201909,['MONTH', YNAME]].copy()
    pred_df.columns = ['MONTH','y_true']
    
    
    # TIME-SERIES REGRESSION #
    trainSet = hfExRet_df.loc[(hfExRet_df['MONTH']>=(2019-NUMYEAR_TRAIN)*100+9)&(hfExRet_df['MONTH']<=201908), 
                              ['MONTH', index]+XNAMELIST].copy()
    trainSet.dropna(inplace=True)
    factorModel = sm.OLS(trainSet[YNAME], sm.add_constant(trainSet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    del trainSet
    
    # PREDICTION #
    pred_df['y_pred'] = factorModel.predict(sm.add_constant(hfExRet_df.loc[hfExRet_df['MONTH']>=201909,XNAMELIST]))
    del factorModel
    
    
    # MSE #
    mse_df.loc[f'TASS {index}', MOD] = mean_squared_error(pred_df['y_true'], pred_df['y_pred'])
    
    
    # HOUSE CLEAN #
    del index, YNAME, pred_df 


# HOUSE CLEAN #
del MOD, XNAMELIST


        #############################
        # FUNG-HSIEH 7-FACTOR MODEL #
        #############################


# DEFINE X #
MOD = 'FH7'
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT']


# LOOP THROUGH CATEGORY #
for index in indexList:
    
    
    # DEFINE DEPENDENT VARIABLE #
    YNAME = index
    
    
    # RECEIVING DATA FRAME FOR PREDICTED VALUES #
    pred_df = hfExRet_df.loc[hfExRet_df['MONTH']>=201909,['MONTH', YNAME]].copy()
    pred_df.columns = ['MONTH','y_true']
    
    
    # TIME-SERIES REGRESSION #
    trainSet = hfExRet_df.loc[(hfExRet_df['MONTH']>=(2019-NUMYEAR_TRAIN)*100+9)&(hfExRet_df['MONTH']<=201908), 
                              ['MONTH', index]+XNAMELIST].copy()
    trainSet.dropna(inplace=True)
    factorModel = sm.OLS(trainSet[YNAME], sm.add_constant(trainSet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    del trainSet
    
    # PREDICTION #
    pred_df['y_pred'] = factorModel.predict(sm.add_constant(hfExRet_df.loc[hfExRet_df['MONTH']>=201909,XNAMELIST]))
    del factorModel
    
    
    # MSE #
    mse_df.loc[f'TASS {index}', MOD] = mean_squared_error(pred_df['y_true'], pred_df['y_pred'])
    
    
    # HOUSE CLEAN #
    del index, YNAME, pred_df 


# HOUSE CLEAN #
del MOD, XNAMELIST


        ###############
        # HOUSE CLEAN #
        ###############


# HOUSE CLEAN #
del hfExRet_df, indexList


    ##############################
    # BARCLAY OUT-OF-SAMPLE TEST #
    ##############################


# IMPORT FACTORS # 
factor_df = pd.read_excel('Month_Factor.xlsx')


# IMPORT FAMA-FRENCH FIVE FACTORS #
ff5factor_df = pd.read_excel('Month_FF5F.xlsx')
factor_df = pd.merge(left=factor_df, right=ff5factor_df, how='left', on=['MONTH'])
del ff5factor_df


# OUT-OF-SAMPLE FACTORS #
oosFactor_df = pd.read_excel('Month_Factor_201909_202112.xlsx')


# COMBINE FACTORS #
factor_df = factor_df.loc[:, oosFactor_df.columns].copy()
factor_df = pd.concat([factor_df, oosFactor_df], axis=0, ignore_index=True)
del oosFactor_df
factor_df.drop_duplicates(subset=['MONTH'], inplace=True, ignore_index=True)


# HEDGE FUND INDEX RETURNS #
#hfRet_df = pd.read_excel('BARCLAYHEDGE_INDEXID_INDEXNAME_RETURN_1997_2022_INDEX_SAMPLE.xlsx')
hfRet_df = pd.read_excel('PSEUDODATA2.xlsx')
hfRet_df.set_index(keys=['INDEXID', 'INDEXNAME'], inplace=True)
monthList = []
for year in range(1997, 2023):
    for month in range(1, 13):
        yyyymm = year * 100 + month
        if yyyymm <= 202203:
            monthList.append(yyyymm)
        del month, yyyymm
    del year
hfRet_df.columns = monthList
del monthList
hfRet_df = hfRet_df.stack()
hfRet_df = hfRet_df.reset_index()
hfRet_df.columns = ['INDEXID', 'INDEXNAME', 'MONTH', 'RET']


# HEDGE FUND INDEX EXCESS RETURN #
hfExRet_df = pd.merge(left=hfRet_df, right=factor_df.loc[:,['MONTH', 'RF']], 
                      how='left', on='MONTH')
hfExRet_df['EXRET'] = hfExRet_df['RET'].copy() - hfExRet_df['RF'].copy()
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
hfExRet_df.drop(columns=['RET','RF'], inplace=True)
del hfRet_df


# PIVOT TABLE #
hfExRet_df = pd.pivot_table(hfExRet_df, values='EXRET', 
                            index='MONTH', columns='INDEXNAME')
hfExRet_df = hfExRet_df.reset_index()
hfExRet_df.columns = ['MONTH', 'CA', 'EM', 'LSEH', 'EMN', 'ED', 'FIA', 'FOF', 
                      'GM', 'IND', 'MS']


# DEFINE NUMBER OF INDECES #
indexList = hfExRet_df.columns[1:]


# MERGE WITH FACTOR TABLE #
hfExRet_df = pd.merge(left=hfExRet_df, right=factor_df, 
                      how='left', on=['MONTH'])
del factor_df


# END WITH 202008 #
hfExRet_df = hfExRet_df.loc[hfExRet_df['MONTH']<=202012, :].copy()


        ##################
        # 9-FACTOR MODEL #
        ##################


# DEFINE X #
MOD = 'HF9'
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']


# LOOP THROUGH CATEGORY #
for index in indexList:
    
    
    # DEFINE DEPENDENT VARIABLE #
    YNAME = index
    
    
    # RECEIVING DATA FRAME FOR PREDICTED VALUES #
    pred_df = hfExRet_df.loc[hfExRet_df['MONTH']>=201909,['MONTH', YNAME]].copy()
    pred_df.columns = ['MONTH','y_true']
    
    
    # TIME-SERIES REGRESSION #
    trainSet = hfExRet_df.loc[(hfExRet_df['MONTH']>=(2019-NUMYEAR_TRAIN)*100+9)&(hfExRet_df['MONTH']<=201908), 
                              ['MONTH', index]+XNAMELIST].copy()
    trainSet.dropna(inplace=True)
    factorModel = sm.OLS(trainSet[YNAME], sm.add_constant(trainSet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    del trainSet
    
    # PREDICTION #
    pred_df['y_pred'] = factorModel.predict(sm.add_constant(hfExRet_df.loc[hfExRet_df['MONTH']>=201909,XNAMELIST]))
    del factorModel
    
    
    # MSE #
    mse_df.loc[f'Barclay {index}', MOD] = mean_squared_error(pred_df['y_true'], pred_df['y_pred'])
    
    
    # HOUSE CLEAN #
    del index, YNAME, pred_df 


# HOUSE CLEAN #
del MOD, XNAMELIST


        ##############
        # CAPM MODEL #
        ##############


# DEFINE X #
MOD = 'CAPM'
XNAMELIST = ['MKT']


# LOOP THROUGH CATEGORY #
for index in indexList:
    
    
    # DEFINE DEPENDENT VARIABLE #
    YNAME = index
    
    
    # RECEIVING DATA FRAME FOR PREDICTED VALUES #
    pred_df = hfExRet_df.loc[hfExRet_df['MONTH']>=201909,['MONTH', YNAME]].copy()
    pred_df.columns = ['MONTH','y_true']
    
    
    # TIME-SERIES REGRESSION #
    trainSet = hfExRet_df.loc[(hfExRet_df['MONTH']>=(2019-NUMYEAR_TRAIN)*100+9)&(hfExRet_df['MONTH']<=201908), 
                              ['MONTH', index]+XNAMELIST].copy()
    trainSet.dropna(inplace=True)
    factorModel = sm.OLS(trainSet[YNAME], sm.add_constant(trainSet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    del trainSet
    
    # PREDICTION #
    pred_df['y_pred'] = factorModel.predict(sm.add_constant(hfExRet_df.loc[hfExRet_df['MONTH']>=201909,XNAMELIST]))
    del factorModel
    
    
    # MSE #
    mse_df.loc[f'Barclay {index}', MOD] = mean_squared_error(pred_df['y_true'], pred_df['y_pred'])
    
    
    # HOUSE CLEAN #
    del index, YNAME, pred_df 


# HOUSE CLEAN #
del MOD, XNAMELIST


        ##############################
        # FANA FRENCH 5-FACTOR MODEL #
        ##############################


# DEFINE X #
MOD = 'FF5'
XNAMELIST = ['FF5F_MKT', 'FF5F_SMB', 'FF5F_HML', 'FF5F_RMW', 'FF5F_CMA']


# LOOP THROUGH CATEGORY #
for index in indexList:
    
    
    # DEFINE DEPENDENT VARIABLE #
    YNAME = index
    
    
    # RECEIVING DATA FRAME FOR PREDICTED VALUES #
    pred_df = hfExRet_df.loc[hfExRet_df['MONTH']>=201909,['MONTH', YNAME]].copy()
    pred_df.columns = ['MONTH','y_true']
    
    
    # TIME-SERIES REGRESSION #
    trainSet = hfExRet_df.loc[(hfExRet_df['MONTH']>=(2019-NUMYEAR_TRAIN)*100+9)&(hfExRet_df['MONTH']<=201908), 
                              ['MONTH', index]+XNAMELIST].copy()
    trainSet.dropna(inplace=True)
    factorModel = sm.OLS(trainSet[YNAME], sm.add_constant(trainSet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    del trainSet
    
    # PREDICTION #
    pred_df['y_pred'] = factorModel.predict(sm.add_constant(hfExRet_df.loc[hfExRet_df['MONTH']>=201909,XNAMELIST]))
    del factorModel
    
    
    # MSE #
    mse_df.loc[f'Barclay {index}', MOD] = mean_squared_error(pred_df['y_true'], pred_df['y_pred'])
    
    
    # HOUSE CLEAN #
    del index, YNAME, pred_df 


# HOUSE CLEAN #
del MOD, XNAMELIST


        #############################
        # FUNG-HSIEH 7-FACTOR MODEL #
        #############################


# DEFINE X #
MOD = 'FH7'
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT']


# LOOP THROUGH CATEGORY #
for index in indexList:
    
    
    # DEFINE DEPENDENT VARIABLE #
    YNAME = index
    
    
    # RECEIVING DATA FRAME FOR PREDICTED VALUES #
    pred_df = hfExRet_df.loc[hfExRet_df['MONTH']>=201909,['MONTH', YNAME]].copy()
    pred_df.columns = ['MONTH','y_true']
    
    
    # TIME-SERIES REGRESSION #
    trainSet = hfExRet_df.loc[(hfExRet_df['MONTH']>=(2019-NUMYEAR_TRAIN)*100+9)&(hfExRet_df['MONTH']<=201908), 
                              ['MONTH', index]+XNAMELIST].copy()
    trainSet.dropna(inplace=True)
    factorModel = sm.OLS(trainSet[YNAME], sm.add_constant(trainSet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    del trainSet
    
    # PREDICTION #
    pred_df['y_pred'] = factorModel.predict(sm.add_constant(hfExRet_df.loc[hfExRet_df['MONTH']>=201909,XNAMELIST]))
    del factorModel
    
    
    # MSE #
    mse_df.loc[f'Barclay {index}', MOD] = mean_squared_error(pred_df['y_true'], pred_df['y_pred'])
    
    
    # HOUSE CLEAN #
    del index, YNAME, pred_df 


# HOUSE CLEAN #
del MOD, XNAMELIST


        ###############
        # HOUSE CLEAN #
        ###############


# HOUSE CLEAN #
del hfExRet_df, indexList



# OUTPUT RESULT TABLE #
mse_df.to_csv(tabDir + '\\Table8.csv')


# HOUSE CLEAN #
del mse_df, NUMYEAR_TRAIN






############################
#         TABLE 9          # 
############################


# IMPORT FACTORS # 
factor_df = pd.read_excel('MONTH_FACTOR.xlsx')
factor_df.drop(columns=['RF'], inplace=True)


# IMPORT FAMA-FRENCH FIVE FACTORS #
ff5factor_df = pd.read_excel('MONTH_FF5F.xlsx')
factor_df = pd.merge(left=factor_df, right=ff5factor_df, how='left', on=['MONTH'])
del ff5factor_df


    ##################
    # TERCILE SPREAD #
    ##################


# IMPORT NAT SCORES, RETURNS AND CAPPED MES #
natRet_df = pd.read_stata('NAT_RET.dta')
natRet_df['MONTH'] = natRet_df['DATE'].dt.year*100+natRet_df['DATE'].dt.month
natRet_df = natRet_df.loc[natRet_df['MONTH']<=201912, :].copy()
    
    
# CUTTING POINT BY NON-MICRO STOCKS #
sizeCut = natRet_df.groupby(by=['MONTH'])['ME'].quantile(q=0.2).reset_index()
sizeCut.columns = ['MONTH', 'ME_CUT']
subSample_df = pd.merge(left=natRet_df, right=sizeCut, how='left', on=['MONTH'])
subSample_df = subSample_df.loc[subSample_df['ME']>=subSample_df['ME_CUT'], ['PERMNO', 'MONTH', 'NAT']].copy()
natCut = pd.concat([subSample_df.groupby(by=['MONTH'])['NAT'].quantile(q=1/3), 
                    subSample_df.groupby(by=['MONTH'])['NAT'].quantile(q=2/3)], axis=1)
natCut = natCut.reset_index()
natCut.columns = ['MONTH', 'NAT_1', 'NAT_2']
del sizeCut, subSample_df
natRet_df = pd.merge(left=natRet_df, right=natCut, how='left', on=['MONTH'])
del natCut

natRet_df['GROUP'] = np.select([natRet_df['NAT']<=natRet_df['NAT_1'], 
                                natRet_df['NAT']<=natRet_df['NAT_2']], 
                               [1,2], default=3)


# DECILE VALUE-WEIGHTED RETURN SPREAD #
natVwSprd_df = natRet_df.groupby(by=['MONTH','GROUP']).apply(lambda x: np.average(x['RET'], weights=x['ME'])).reset_index()
natVwSprd_df = pd.pivot(natVwSprd_df, columns='GROUP', index='MONTH', values=0)
natVwSprd_df['VW'] = natVwSprd_df[3] - natVwSprd_df[1]


# DECILE EQUAL-WEIGHTED RETURN SPREAD #
natEwSprd_df = natRet_df.groupby(by=['MONTH','GROUP'])['RET'].mean().reset_index()
natEwSprd_df = pd.pivot(natEwSprd_df, columns='GROUP', index='MONTH', values='RET')
natEwSprd_df['EW'] = natEwSprd_df[3] - natEwSprd_df[1]


# MERGE TWO SPREAD SERIES #
natRet_df = pd.concat([natEwSprd_df['EW'], natVwSprd_df['VW']], axis=1)
natRet_df = natRet_df.reset_index()
del natEwSprd_df, natVwSprd_df


# PREPARE REGRESSION DATA #
regData_df = pd.merge(left=natRet_df, right=factor_df, 
                      how='inner', on=['MONTH'])


# RECEIVING DATA FRAME FOR ALPHA AND ADJ R-SQUARED #
tercileOut_df = pd.DataFrame(columns = ['EW Alpha', 'EW Adj-R2',
                                        'VW Alpha', 'VW Adj-R2'], 
                             index = ['HF9 Est', 'HF9 T-stat', 
                                      'CAPM Est', 'CAPM T-stat',
                                      'FF5 Est', 'FF5 T-stat', 
                                      'FH7 Est', 'FH7 T-stat'])


        #######
        # HF9 #
        #######
    

# DEFINE FACTOR LIST #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']


# LOOP THROUGH DECILE PORTFOLIOS #
for port in ['EW', 'VW']:
    
    
    # PORTFOLIO RETURN ~ HF9 FACTORS #
    factorModel = sm.OLS(regData_df[port], sm.add_constant(regData_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    tercileOut_df.loc['HF9 Est', [f'{port} Alpha', f'{port} Adj-R2']] = factorModel.params[0]*12*100, factorModel.rsquared_adj
    tercileOut_df.loc['HF9 T-stat', f'{port} Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    del port, factorModel
    

# HOUSE CLEAN #
del XNAMELIST


        ########
        # CAPM #
        ########
    

# DEFINE FACTOR LIST #
XNAMELIST = ['MKT']


# LOOP THROUGH DECILE PORTFOLIOS #
for port in ['EW', 'VW']:
    
    
    # PORTFOLIO RETURN ~ HF9 FACTORS #
    factorModel = sm.OLS(regData_df[port], sm.add_constant(regData_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    tercileOut_df.loc['CAPM Est', [f'{port} Alpha', f'{port} Adj-R2']] = factorModel.params[0]*12*100, factorModel.rsquared_adj
    tercileOut_df.loc['CAPM T-stat', f'{port} Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    del port, factorModel
    

# HOUSE CLEAN #
del XNAMELIST


        #######
        # FF5 #
        #######
    

# DEFINE FACTOR LIST #
XNAMELIST = ['FF5F_MKT', 'FF5F_SMB', 'FF5F_HML', 'FF5F_RMW', 'FF5F_CMA']


# LOOP THROUGH DECILE PORTFOLIOS #
for port in ['EW', 'VW']:
    
    
    # PORTFOLIO RETURN ~ HF9 FACTORS #
    factorModel = sm.OLS(regData_df[port], sm.add_constant(regData_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    tercileOut_df.loc['FF5 Est', [f'{port} Alpha', f'{port} Adj-R2']] = factorModel.params[0]*12*100, factorModel.rsquared_adj
    tercileOut_df.loc['FF5 T-stat', f'{port} Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    del port, factorModel
    

# HOUSE CLEAN #
del XNAMELIST


        #######
        # FH7 #
        #######
    

# DEFINE FACTOR LIST #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT']


# LOOP THROUGH DECILE PORTFOLIOS #
for port in ['EW', 'VW']:
    
    
    # PORTFOLIO RETURN ~ HF9 FACTORS #
    factorModel = sm.OLS(regData_df[port], sm.add_constant(regData_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    tercileOut_df.loc['FH7 Est', [f'{port} Alpha', f'{port} Adj-R2']] = factorModel.params[0]*12*100, factorModel.rsquared_adj
    tercileOut_df.loc['FH7 T-stat', f'{port} Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    del port, factorModel
    

# HOUSE CLEAN #
del XNAMELIST


# OUTPUT RESULTS #
tercileOut_df.to_excel(tabDir + '\\Table9_PanelA.xlsx')


# HOUSE CLEAN #
del tercileOut_df, natRet_df, regData_df


    ###################
    # QUINTILE SPREAD #
    ###################


# IMPORT NAT SCORES, RETURNS AND CAPPED MES #
natRet_df = pd.read_stata('NAT_RET.dta')
natRet_df['MONTH'] = natRet_df['DATE'].dt.year*100+natRet_df['DATE'].dt.month
natRet_df = natRet_df.loc[natRet_df['MONTH']<=201912, :].copy()



# CUTTING POINT BY NON-MICRO STOCKS #
sizeCut = natRet_df.groupby(by=['MONTH'])['ME'].quantile(q=0.2).reset_index()
sizeCut.columns = ['MONTH', 'ME_CUT']
subSample_df = pd.merge(left=natRet_df, right=sizeCut, how='left', on=['MONTH'])
subSample_df = subSample_df.loc[subSample_df['ME']>=subSample_df['ME_CUT'], ['PERMNO', 'MONTH', 'NAT']].copy()
natCut = pd.concat([subSample_df.groupby(by=['MONTH'])['NAT'].quantile(q=1/5), 
                    subSample_df.groupby(by=['MONTH'])['NAT'].quantile(q=2/5),
                    subSample_df.groupby(by=['MONTH'])['NAT'].quantile(q=3/5), 
                    subSample_df.groupby(by=['MONTH'])['NAT'].quantile(q=4/5)], axis=1)
natCut = natCut.reset_index()
natCut.columns = ['MONTH', 'NAT_1', 'NAT_2', 'NAT_3', 'NAT_4']
del sizeCut, subSample_df
natRet_df = pd.merge(left=natRet_df, right=natCut, how='left', on=['MONTH'])
del natCut

natRet_df['GROUP'] = np.select([natRet_df['NAT']<=natRet_df['NAT_1'], 
                                natRet_df['NAT']<=natRet_df['NAT_2'], 
                                natRet_df['NAT']<=natRet_df['NAT_3'], 
                                natRet_df['NAT']<=natRet_df['NAT_4']], 
                               [1,2,3,4], default=5)


# DECILE VALUE-WEIGHTED RETURN SPREAD #
natVwSprd_df = natRet_df.groupby(by=['MONTH','GROUP']).apply(lambda x: np.average(x['RET'], weights=x['ME'])).reset_index()
natVwSprd_df = pd.pivot(natVwSprd_df, columns='GROUP', index='MONTH', values=0)
natVwSprd_df['VW'] = natVwSprd_df[5] - natVwSprd_df[1]


# DECILE EQUAL-WEIGHTED RETURN SPREAD #
natEwSprd_df = natRet_df.groupby(by=['MONTH','GROUP'])['RET'].mean().reset_index()
natEwSprd_df = pd.pivot(natEwSprd_df, columns='GROUP', index='MONTH', values='RET')
natEwSprd_df['EW'] = natEwSprd_df[5] - natEwSprd_df[1]


# MERGE TWO SPREAD SERIES #
natRet_df = pd.concat([natEwSprd_df['EW'], natVwSprd_df['VW']], axis=1)
natRet_df = natRet_df.reset_index()
del natEwSprd_df, natVwSprd_df


# PREPARE REGRESSION DATA #
regData_df = pd.merge(left=natRet_df, right=factor_df, 
                      how='inner', on=['MONTH'])


# RECEIVING DATA FRAME FOR ALPHA AND ADJ R-SQUARED #
quintileOut_df = pd.DataFrame(columns = ['EW Alpha', 'EW Adj-R2',
                                         'VW Alpha', 'VW Adj-R2'], 
                              index = ['HF9 Est', 'HF9 T-stat', 
                                       'CAPM Est', 'CAPM T-stat',
                                       'FF5 Est', 'FF5 T-stat', 
                                       'FH7 Est', 'FH7 T-stat'])


        #######
        # HF9 #
        #######
    

# DEFINE FACTOR LIST #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']


# LOOP THROUGH DECILE PORTFOLIOS #
for port in ['EW', 'VW']:
    
    
    # PORTFOLIO RETURN ~ HF9 FACTORS #
    factorModel = sm.OLS(regData_df[port], sm.add_constant(regData_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    quintileOut_df.loc['HF9 Est', [f'{port} Alpha', f'{port} Adj-R2']] = factorModel.params[0]*12*100, factorModel.rsquared_adj
    quintileOut_df.loc['HF9 T-stat', f'{port} Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    del port, factorModel
    

# HOUSE CLEAN #
del XNAMELIST


        ########
        # CAPM #
        ########
    

# DEFINE FACTOR LIST #
XNAMELIST = ['MKT']


# LOOP THROUGH DECILE PORTFOLIOS #
for port in ['EW', 'VW']:
    
    
    # PORTFOLIO RETURN ~ HF9 FACTORS #
    factorModel = sm.OLS(regData_df[port], sm.add_constant(regData_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    quintileOut_df.loc['CAPM Est', [f'{port} Alpha', f'{port} Adj-R2']] = factorModel.params[0]*12*100, factorModel.rsquared_adj
    quintileOut_df.loc['CAPM T-stat', f'{port} Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    del port, factorModel
    

# HOUSE CLEAN #
del XNAMELIST


        #######
        # FF5 #
        #######
    

# DEFINE FACTOR LIST #
XNAMELIST = ['FF5F_MKT', 'FF5F_SMB', 'FF5F_HML', 'FF5F_RMW', 'FF5F_CMA']


# LOOP THROUGH DECILE PORTFOLIOS #
for port in ['EW', 'VW']:
    
    
    # PORTFOLIO RETURN ~ HF9 FACTORS #
    factorModel = sm.OLS(regData_df[port], sm.add_constant(regData_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    quintileOut_df.loc['FF5 Est', [f'{port} Alpha', f'{port} Adj-R2']] = factorModel.params[0]*12*100, factorModel.rsquared_adj
    quintileOut_df.loc['FF5 T-stat', f'{port} Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    del port, factorModel
    

# HOUSE CLEAN #
del XNAMELIST


        #######
        # FH7 #
        #######
    

# DEFINE FACTOR LIST #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT']


# LOOP THROUGH DECILE PORTFOLIOS #
for port in ['EW', 'VW']:
    
    
    # PORTFOLIO RETURN ~ HF9 FACTORS #
    factorModel = sm.OLS(regData_df[port], sm.add_constant(regData_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    quintileOut_df.loc['FH7 Est', [f'{port} Alpha', f'{port} Adj-R2']] = factorModel.params[0]*12*100, factorModel.rsquared_adj
    quintileOut_df.loc['FH7 T-stat', f'{port} Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    del port, factorModel
    

# HOUSE CLEAN #
del XNAMELIST


# OUTPUT RESULTS #
quintileOut_df.to_excel(tabDir + '\\Table9_PanelB.xlsx')


# HOUSE CLEAN #
del natRet_df, quintileOut_df, regData_df, factor_df






############################
#         TABLE 10         # 
############################


# IMPORT FACTORS # 
factor_df = pd.read_excel('MONTH_FACTOR.xlsx')
factor_df.drop(columns=['RF'], inplace=True)


# IMPORT FAMA-FRENCH FIVE FACTORS #
ff5factor_df = pd.read_excel('MONTH_FF5F.xlsx')
factor_df = pd.merge(left=factor_df, right=ff5factor_df, how='left', on=['MONTH'])
del ff5factor_df


# IMPORT ADDITIONAL FACTORS #
addFactor_df = pd.read_excel('MONTH_ADDITIONAL_FACTOR.xlsx')
factor_df = pd.merge(left=factor_df, right=addFactor_df, how='left', on=['MONTH'])
del addFactor_df


# HEDGE FUND EXCESS RETURNS #
#hfExRet_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfExRet_df = pd.read_pickle('PSEUDODATA1.pkl')
hfExRet_df = hfExRet_df.loc[:,['FUNDID','MONTH','EXRET', 'CAT', 'AUM']].copy()
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 713,737 fund-month observations; 7,314 unique funds


# REPLACE ZERO AUM WITH MISSING #
hfExRet_df['AUM'].replace(0, np.NaN, inplace=True)


    ##################################
    # HEDGE FUND INDUSTRY PORTFOLIOS #
    ##################################


# RECEIVING DATA FRAMES #
indOut_df = pd.DataFrame(columns=['HF9 Alpha', 'HF9 Adj-R2',
                                  'ABGS Alpha', 'ABGS Adj-R2',
                                  'FH7EC Alpha', 'FH7EC Adj-R2',
                                  'FH7CP Alpha', 'FH7CP Adj-R2'], 
                         index= ['EW Est', 'EW T-stat', 'VW Est', 'VW T-stat'])


# IF MISSING AUM, WEIGHTS SET AS ZERO TO EXCLUDE RETURNS #
hfExRet_df['AUM'].fillna(0, inplace=True)


# CONSTRUCT EW AND VW HEDGE FUND INDUSTRY RETURNS #
indExRet = pd.concat([hfExRet_df.groupby(by=['MONTH'])['EXRET'].mean(), 
                      hfExRet_df.groupby(by=['MONTH']).apply(lambda x: np.average(x['EXRET'], weights=x['AUM']))], axis=1)
indExRet.columns = ['EXRET_EW','EXRET_VW']
indExRet = indExRet.reset_index()


# RESTORE AUM MISSING #
hfExRet_df['AUM'].replace(0, np.NaN, inplace=True)


# MERGE WITH FACTOR TABLE #
indExRet = pd.merge(left=indExRet, right=factor_df, how='left', on=['MONTH'])


        ################################################
        # EW INDUSTRY PORTFOLIO TIME-SERIES REGRESSION #
        ################################################
    
    
# DEFINE DEPENDENT VARIABLE #
YNAME = 'EXRET_EW'


# 9-FACTOR MODEL #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['EW Est', ['HF9 Alpha', 'HF9 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['EW T-stat', 'HF9 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# ABGS MODEL #
XNAMELIST = ['FF5F_MKT','FF5F_SMB','ILL','BAB','SVAR','CARRY','TSMOM']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['EW Est', ['ABGS Alpha', 'ABGS Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['EW T-stat', 'ABGS Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# FUNG-HSIEH 7-FACTOR + EMERGING MARKET + CORRELATION RISK MODEL #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT', 'EMKT', 'CRK']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['EW Est', ['FH7EC Alpha', 'FH7EC Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['EW T-stat', 'FH7EC Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# FUNG-HSIEH 7-FACTOR + OPTION FACTORS MODEL #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT', 'OTMC', 'OTMP']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['EW Est', ['FH7CP Alpha', 'FH7CP Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['EW T-stat', 'FH7CP Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# HOUSE CLEAN #
del YNAME


        ################################################
        # VW INDUSTRY PORTFOLIO TIME-SERIES REGRESSION #
        ################################################
    
    
# DEFINE DEPENDENT VARIABLE #
YNAME = 'EXRET_VW'


# 9-FACTOR MODEL #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['VW Est', ['HF9 Alpha', 'HF9 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['VW T-stat', 'HF9 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# ABGS MODEL #
XNAMELIST = ['FF5F_MKT','FF5F_SMB','ILL','BAB','SVAR','CARRY','TSMOM']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['VW Est', ['ABGS Alpha', 'ABGS Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['VW T-stat', 'ABGS Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# FUNG-HSIEH 7-FACTOR + EMERGING MARKET + CORRELATION RISK MODEL #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT', 'EMKT', 'CRK']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['VW Est', ['FH7EC Alpha', 'FH7EC Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['VW T-stat', 'FH7EC Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# FUNG-HSIEH 7-FACTOR + OPTION FACTORS MODEL #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT', 'OTMC', 'OTMP']
factorModel = sm.OLS(indExRet[YNAME], sm.add_constant(indExRet.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})

indOut_df.loc['VW Est', ['FH7CP Alpha', 'FH7CP Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
indOut_df.loc['VW T-stat', 'FH7CP Alpha'] = f'({factorModel.tvalues[0]: .2f})'
del XNAMELIST, factorModel


# HOUSE CLEAN #
del YNAME
del indExRet


    #####################################################
    # EQUAL-WEIGHTED STRATEGY-CATEGORY-BASED PORTFOLIOS #
    #####################################################


# RECEIVING DATA FRAMES #
ewCatOut_df = pd.DataFrame(columns=['HF9 Alpha', 'HF9 Adj-R2',
                                    'ABGS Alpha', 'ABGS Adj-R2',
                                    'FH7EC Alpha', 'FH7EC Adj-R2',
                                    'FH7CP Alpha', 'FH7CP Adj-R2'], 
                           index= ['EW CA Est', 'EW CA T-stat', 
                                   'EW DSB Est', 'EW DSB T-stat',
                                   'EW ED Est', 'EW ED T-stat', 
                                   'EW EM Est', 'EW EM T-stat',
                                   'EW EMN Est', 'EW EMN T-stat', 
                                   'EW FIA Est', 'EW FIA T-stat',
                                   'EW FOF Est', 'EW FOF T-stat', 
                                   'EW GM Est', 'EW GM T-stat',
                                   'EW LSEH Est', 'EW LSEH T-stat', 
                                   'EW MS Est', 'EW MS T-stat',
                                   'Average |Alpha|', 'Average Adj-R2', 'F-GRS'])


# CONSTRUCT CATEGORY HEDGE FUND EXCESS RETURNS #
catExRet_df = pd.concat([hfExRet_df.groupby(by=['CAT','MONTH'])['EXRET'].mean(), 
                         hfExRet_df.groupby(by=['CAT','MONTH'])['EXRET'].count()], 
                        axis=1)
catExRet_df.columns = ['EXRET','NUMFUNDS']
catExRet_df = catExRet_df.reset_index()
# 2,697 category-month observations; 10 unique categories
catList = list(catExRet_df['CAT'].unique())


# PIVOT TABLE #
catExRet_df = pd.pivot_table(catExRet_df, values='EXRET', 
                             index='MONTH', columns='CAT')
catExRet_df = catExRet_df.reset_index()

# MERGE WITH FACTOR TABLE #
catExRet_df = pd.merge(left=catExRet_df, right=factor_df, 
                       how='left', on=['MONTH'])


        ##################
        # 9-FACTOR MODEL #
        ##################


# DEFINE X #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    ewCatOut_df.loc[f'EW {cat} Est', ['HF9 Alpha', 'HF9 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    ewCatOut_df.loc[f'EW {cat} T-stat', 'HF9 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
ewCatOut_df.loc['Average |Alpha|', 'HF9 Alpha'] = np.abs(alpha_df['Alpha']).mean()
ewCatOut_df.loc['Average Adj-R2', 'HF9 Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
ewCatOut_df.loc['F-GRS', 'HF9 Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        ##############
        # ABGS MODEL #
        ##############


# DEFINE X #
XNAMELIST = ['FF5F_MKT','FF5F_SMB','ILL','BAB','SVAR','CARRY','TSMOM']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    ewCatOut_df.loc[f'EW {cat} Est', ['ABGS Alpha', 'ABGS Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    ewCatOut_df.loc[f'EW {cat} T-stat', 'ABGS Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
ewCatOut_df.loc['Average |Alpha|', 'ABGS Alpha'] = np.abs(alpha_df['Alpha']).mean()
ewCatOut_df.loc['Average Adj-R2', 'ABGS Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
ewCatOut_df.loc['F-GRS', 'ABGS Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        ###########################
        # FH7F + EMKT + CRK MODEL #
        ###########################


# DEFINE X #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT', 'EMKT', 'CRK']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    ewCatOut_df.loc[f'EW {cat} Est', ['FH7EC Alpha', 'FH7EC Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    ewCatOut_df.loc[f'EW {cat} T-stat', 'FH7EC Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
ewCatOut_df.loc['Average |Alpha|', 'FH7EC Alpha'] = np.abs(alpha_df['Alpha']).mean()
ewCatOut_df.loc['Average Adj-R2', 'FH7EC Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
ewCatOut_df.loc['F-GRS', 'FH7EC Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        ############################
        # FH7F + OTMC + OTMP MODEL #
        ############################


# DEFINE X #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT', 'OTMC', 'OTMP']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    ewCatOut_df.loc[f'EW {cat} Est', ['FH7CP Alpha', 'FH7CP Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    ewCatOut_df.loc[f'EW {cat} T-stat', 'FH7CP Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
ewCatOut_df.loc['Average |Alpha|', 'FH7CP Alpha'] = np.abs(alpha_df['Alpha']).mean()
ewCatOut_df.loc['Average Adj-R2', 'FH7CP Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
ewCatOut_df.loc['F-GRS', 'FH7CP Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 
del catExRet_df, catList


    #####################################################
    # VALUE-WEIGHTED STRATEGY-CATEGORY-BASED PORTFOLIOS #
    #####################################################


# RECEIVING DATA FRAMES #
vwCatOut_df = pd.DataFrame(columns=['HF9 Alpha', 'HF9 Adj-R2',
                                    'ABGS Alpha', 'ABGS Adj-R2',
                                    'FH7EC Alpha', 'FH7EC Adj-R2',
                                    'FH7CP Alpha', 'FH7CP Adj-R2'], 
                           index= ['VW CA Est', 'VW CA T-stat', 
                                   'VW DSB Est', 'VW DSB T-stat',
                                   'VW ED Est', 'VW ED T-stat', 
                                   'VW EM Est', 'VW EM T-stat',
                                   'VW EMN Est', 'VW EMN T-stat', 
                                   'VW FIA Est', 'VW FIA T-stat',
                                   'VW FOF Est', 'VW FOF T-stat', 
                                   'VW GM Est', 'VW GM T-stat',
                                   'VW LSEH Est', 'VW LSEH T-stat', 
                                   'VW MS Est', 'VW MS T-stat',
                                   'Average |Alpha|', 'Average Adj-R2', 'F-GRS'])


# IF MISSING AUM, WEIGHTS SET AS ZERO TO EXCLUDE RETURNS #
hfExRet_df['AUM'].fillna(0, inplace=True)


# CONSTRUCT CATEGORY HEDGE FUND EXCESS RETURNS #
catExRet_df = pd.concat([hfExRet_df.groupby(by=['CAT','MONTH']).apply(lambda x: np.average(x['EXRET'], weights=x['AUM'])), 
                         hfExRet_df.groupby(by=['CAT','MONTH'])['AUM'].count()], 
                        axis=1)
catExRet_df.columns = ['EXRET','NUMFUNDS']
catExRet_df = catExRet_df.reset_index()
# 2,697 category-month observations; 10 unique categories
catList = list(catExRet_df['CAT'].unique())


# RESTORE AUM MISSING #
hfExRet_df['AUM'].replace(0, np.NaN, inplace=True)


# PIVOT TABLE #
catExRet_df = pd.pivot_table(catExRet_df, values='EXRET', 
                             index='MONTH', columns='CAT')
catExRet_df = catExRet_df.reset_index()

# MERGE WITH FACTOR TABLE #
catExRet_df = pd.merge(left=catExRet_df, right=factor_df, 
                       how='left', on=['MONTH'])


        ##################
        # 9-FACTOR MODEL #
        ##################


# DEFINE X #
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    vwCatOut_df.loc[f'VW {cat} Est', ['HF9 Alpha', 'HF9 Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    vwCatOut_df.loc[f'VW {cat} T-stat', 'HF9 Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
vwCatOut_df.loc['Average |Alpha|', 'HF9 Alpha'] = np.abs(alpha_df['Alpha']).mean()
vwCatOut_df.loc['Average Adj-R2', 'HF9 Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
vwCatOut_df.loc['F-GRS', 'HF9 Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        ##############
        # ABGS MODEL #
        ##############


# DEFINE X #
XNAMELIST = ['FF5F_MKT','FF5F_SMB','ILL','BAB','SVAR','CARRY','TSMOM']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    vwCatOut_df.loc[f'VW {cat} Est', ['ABGS Alpha', 'ABGS Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    vwCatOut_df.loc[f'VW {cat} T-stat', 'ABGS Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
vwCatOut_df.loc['Average |Alpha|', 'ABGS Alpha'] = np.abs(alpha_df['Alpha']).mean()
vwCatOut_df.loc['Average Adj-R2', 'ABGS Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
vwCatOut_df.loc['F-GRS', 'ABGS Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        ###########################
        # FH7F + EMKT + CRK MODEL #
        ###########################


# DEFINE X #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT', 'EMKT', 'CRK']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    vwCatOut_df.loc[f'VW {cat} Est', ['FH7EC Alpha', 'FH7EC Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    vwCatOut_df.loc[f'VW {cat} T-stat', 'FH7EC Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
vwCatOut_df.loc['Average |Alpha|', 'FH7EC Alpha'] = np.abs(alpha_df['Alpha']).mean()
vwCatOut_df.loc['Average Adj-R2', 'FH7EC Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
vwCatOut_df.loc['F-GRS', 'FH7EC Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 


        ############################
        # FH7F + OTMC + OTMP MODEL #
        ############################


# DEFINE X #
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT', 'OTMC', 'OTMP']


# RECEIVING DATA FRAMES #
alpha_df = pd.DataFrame(columns=['Alpha', 'Adj-R2'], index=catList)
epsilon_df = pd.DataFrame(columns=catList, index=catExRet_df['MONTH'])


# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DEFINE DEPENDENT VARIABLE AND XS #
    YNAME = cat
    
    # TIME-SERIES REGRESSION #
    reg_df = catExRet_df.loc[:, ['MONTH', YNAME]+XNAMELIST].copy()
    reg_df.dropna(inplace=True)
    factorModel = sm.OLS(reg_df[YNAME], sm.add_constant(reg_df.loc[:, XNAMELIST])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    
    vwCatOut_df.loc[f'VW {cat} Est', ['FH7CP Alpha', 'FH7CP Adj-R2']] = factorModel.params[0], factorModel.rsquared_adj
    vwCatOut_df.loc[f'VW {cat} T-stat', 'FH7CP Alpha'] = f'({factorModel.tvalues[0]: .2f})'
    
    alpha_df.loc[cat, :] = factorModel.params[0], factorModel.rsquared_adj
    reg_df['resid'] = factorModel.resid
    reg_df = reg_df.set_index(keys=['MONTH'])
    epsilon_df[cat] = reg_df['resid'].copy()
    del cat, YNAME, reg_df, factorModel


# AVERAGE STATISTICS #
vwCatOut_df.loc['Average |Alpha|', 'FH7CP Alpha'] = np.abs(alpha_df['Alpha']).mean()
vwCatOut_df.loc['Average Adj-R2', 'FH7CP Adj-R2'] = alpha_df['Adj-R2'].mean()


# GRS TEST #
# EXCLUDE DSB FOR BALANCED DATA #
alpha_df = alpha_df.loc[alpha_df.index!='DSB', 'Alpha'].copy()
epsilon_df.drop(columns=['DSB'], inplace=True)
factorGRS_df = factor_df.loc[factor_df["MONTH"].isin(list(epsilon_df.index)), 
                             ["MONTH"]+XNAMELIST].copy() 
factorGRS_df.set_index(keys='MONTH', inplace=True)
grsF, grsP = grsTestPvalue(alpha_df, factorGRS_df, epsilon_df)
vwCatOut_df.loc['F-GRS', 'FH7CP Alpha'] = grsF


# HOUSE CLEAN #
del XNAMELIST, alpha_df, epsilon_df, factorGRS_df, grsF, grsP 
del catExRet_df, catList


    ##################
    # OUTPUT RESULTS #
    ##################


# OUTPUT RESULT TABLE #
out_df = pd.concat([indOut_df, ewCatOut_df, vwCatOut_df])
out_df.to_excel(tabDir + '\\Table10.xlsx')


# HOUSE CLEAN #
del ewCatOut_df, factor_df, hfExRet_df, indOut_df, out_df, vwCatOut_df






############################
#         TABLE 11         # 
############################


# IMPORT FUNCTION #
from linearmodels.panel import PanelOLS


# IMPORT FACTORS # 
factor_df = pd.read_excel('MONTH_FACTOR.xlsx')
rf_df = factor_df.loc[:, ['MONTH','RF']].copy()
factor_df.drop(columns=['RF'], inplace=True)


# IMPORT FAMA-FRENCH FIVE FACTORS #
ff5factor_df = pd.read_excel('MONTH_FF5F.xlsx')
factor_df = pd.merge(left=factor_df, right=ff5factor_df, how='left', on=['MONTH'])
del ff5factor_df


# HEDGE FUND EXCESS RETURNS #
#hfExRet_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfExRet_df = pd.read_pickle('PSEUDODATA1.pkl')
hfExRet_df = hfExRet_df.loc[:,['FUNDID','MONTH','EXRET', 'AUM']].copy()
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 713,737 fund-month observations; 7,314 unique funds


# REPLACE ZERO AUM WITH MISSING #
hfExRet_df['AUM'].replace(0, np.NaN, inplace=True)


# CHECK IF HF TABLE DO NOT SKIP MONTHS #
hfExRet_df['MONTH'] = pd.to_datetime(hfExRet_df['MONTH'].astype(str), 
                                     format='%Y%m')
def expand_id_months(group):
    id_ = group['FUNDID'].iloc[0]
    min_month = group['MONTH'].min()
    max_month = group['MONTH'].max()
    full_months = pd.date_range(start=min_month, end=max_month, freq='MS')
    full_group = pd.DataFrame({'FUNDID': id_, 'MONTH': full_months})
    merged = pd.merge(left=full_group, right=group, 
                      on=['FUNDID','MONTH'], how='left')
    return merged

hfExRet_df = hfExRet_df.groupby('FUNDID').apply(expand_id_months).reset_index(drop=True)
hfExRet_df['MONTH'] = hfExRet_df['MONTH'].dt.year*100 + hfExRet_df['MONTH'].dt.month


    ######################################
    # FUND ALPHA WITH 24 ROLLING WINDOWS #
    ######################################


# MERGE RETURNS WITH FACTORS #
merged_df = pd.merge(left=hfExRet_df, right=factor_df, how='left', on='MONTH')
merged_df = merged_df.sort_values(['FUNDID','MONTH'])


# FUNCTION FOR ROLLING REGRESSION #
def rolling_alpha(group, xList, suffix, window=24, min_periods=24):
    
    
    # RECEIVING LIST FOR ALPHAS #
    alphas = [np.nan] * len(group)
    months = group['MONTH'].reset_index(drop=True)
    
    
    # LOOP OVER DATA #
    for idx in range(len(group)):
        currentMonth = months[idx]
        
        
        # CHECK IF CURRENT PREDICTION MONTH IS JANUARY #
        if currentMonth % 100 == 1:
            
            
            # CHECK IF ENOUGH DATA FOR REGRESSION #
            if idx >= window: 
                
                
                # ROLLING WINDOW SLICE #
                window_slice = group.iloc[idx - window:idx, :].copy()
                
                
                # CHECK FOR MINIMUM NON-MISSING OBSERVATIONS #
                if window_slice['EXRET'].dropna().shape[0] >= min_periods:
                    
                    
                    # DEFINE Y AND XS #
                    Y = window_slice['EXRET'].copy()
                    X = sm.add_constant(window_slice.loc[:, xList])
                    
                    
                    # ALPHA ESTIMATES #
                    model = sm.OLS(Y, X, missing='drop').fit()
                    alpha = model.params['const']
                    alphas[idx] = alpha 
                    
                    
                    # HOUSE CLEAN #
                    del Y, X, model, alpha
            
                # HOUSE CLEAN #
                del window_slice
                
        # HOUSE CLEAN #
        del idx, currentMonth
        
        
    # ASSIGN ALPHAS BACK TO GROUP #
    group = group.reset_index(drop=True)
    group[f'{suffix}'] = alphas
    return group


# ROLLING WINDOW ALPHAS #
hfAlpha_df = merged_df.loc[:, ['FUNDID','MONTH']].copy()
# HF9
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']
temp = merged_df.groupby('FUNDID').apply(rolling_alpha, xList=XNAMELIST, suffix='HF9').reset_index(drop=True)
hfAlpha_df['HF9'] = temp['HF9'].copy()
del XNAMELIST, temp
XNAMELIST = ['MKT']
temp = merged_df.groupby('FUNDID').apply(rolling_alpha, xList=XNAMELIST, suffix='CAPM').reset_index(drop=True)
hfAlpha_df['CAPM'] = temp['CAPM'].copy()
del XNAMELIST, temp
XNAMELIST = ['FF5F_MKT', 'FF5F_SMB', 'FF5F_HML', 'FF5F_RMW', 'FF5F_CMA']
temp = merged_df.groupby('FUNDID').apply(rolling_alpha, xList=XNAMELIST, suffix='FF5').reset_index(drop=True)
hfAlpha_df['FF5'] = temp['FF5'].copy()
del XNAMELIST, temp
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT']
temp = merged_df.groupby('FUNDID').apply(rolling_alpha, xList=XNAMELIST, suffix='FH7').reset_index(drop=True)
hfAlpha_df['FH7'] = temp['FH7'].copy()
del XNAMELIST, temp


# ONLY KEEP YEARLY OBSERVATIONS #
hfAlpha_df['YEAR'] = hfAlpha_df['MONTH'] // 100
hfAlpha_df['MONTH'] = hfAlpha_df['MONTH'] % 100
hfAlpha_df = hfAlpha_df.loc[hfAlpha_df['MONTH']==1, :].copy()
hfAlpha_df.drop(columns=['MONTH'], inplace=True)


# HOUSE CLEAN #
del merged_df


    ##############
    # FUND FLOWS #
    ##############


# RECOVER YEARLY NET-OF-FEE RETURNS #
merged_df = pd.merge(left=hfExRet_df, right=rf_df, how='left', on=['MONTH'])
merged_df = merged_df.sort_values(['FUNDID','MONTH'])
merged_df['RET'] = np.log(1 + (merged_df['EXRET'] + merged_df['RF']) / 100)
merged_df['RET'] = (np.exp(merged_df.groupby('FUNDID')['RET'].rolling(window=12).sum()) - 1).values 


# ONLY KEEP DECEMBER OBSERVATIONS # 
merged_df['YEAR'] = merged_df['MONTH'] // 100
merged_df['MONTH'] = merged_df['MONTH'] % 100
merged_df = merged_df.loc[merged_df['MONTH']==12, ['FUNDID', 'YEAR', 'AUM', 'RET']].copy()


# MONTHLY FLOW IN PERCENTAGE #
merged_df = merged_df.sort_values(by=['FUNDID','YEAR'])
merged_df['AUM_L1'] = merged_df.groupby('FUNDID')['AUM'].shift(1)
merged_df['FLOW'] = 100 * (merged_df['AUM'] / merged_df['AUM_L1'] - (1 + merged_df['RET']))
hfFlow_df = merged_df.loc[:, ['FUNDID', 'YEAR', 'FLOW', 'AUM']].copy()


# CROSS-SECTIONAL WINSORIZE #
hfFlowHL_df = pd.concat([hfFlow_df.groupby('YEAR')['FLOW'].quantile(q=0.99),
                         hfFlow_df.groupby('YEAR')['FLOW'].quantile(q=0.01)],
                        axis=1)
hfFlowHL_df.columns = ['H', 'L']
hfFlow_df = pd.merge(left=hfFlow_df, right=hfFlowHL_df, how='left', on='YEAR')
hfFlow_df.loc[hfFlow_df['FLOW']>hfFlow_df['H'], 'FLOW'] = hfFlow_df.loc[hfFlow_df['FLOW']>hfFlow_df['H'], 'H']
hfFlow_df.loc[hfFlow_df['FLOW']<hfFlow_df['L'], 'FLOW'] = hfFlow_df.loc[hfFlow_df['FLOW']<hfFlow_df['L'], 'L']
hfFlow_df.drop(columns=['H','L'], inplace=True)
del merged_df, hfFlowHL_df


    #############################
    # FLOW-PERFORMANCE ANALYSIS #
    #############################


# MERGE WITH ALPHA DATA #
hfFlowAlpha_df = pd.merge(left=hfFlow_df, right=hfAlpha_df, 
                          how='left', on=['FUNDID','YEAR'])
hfFlowAlpha_df.dropna(inplace=True)


# ANNUALIZE ALPHAS #
hfFlowAlpha_df['HF9'] = hfFlowAlpha_df['HF9'] * 12
hfFlowAlpha_df['CAPM'] = hfFlowAlpha_df['CAPM'] * 12
hfFlowAlpha_df['FF5'] = hfFlowAlpha_df['FF5'] * 12
hfFlowAlpha_df['FH7'] = hfFlowAlpha_df['FH7'] * 12


# RECEIVING LIST FOR FLOWS OF ALPHA-SORTED PORTFOLIOS #
alphaFLowDf_list = list()


# LOOP THROUGH MODELS #
for model in ['HF9', 'CAPM', 'FF5', 'FH7']:
    
    
    # EXTRACT FLOW AND MODEL ALPHAS #
    alphaSortedFlow_df = hfFlowAlpha_df.loc[:, ['FUNDID','YEAR','FLOW',f'{model}','AUM']].copy()
    
    
    # CROSS-SECTIONALLY SORT BY ALPHAS #
    alphaSortedFlow_df['GROUP'] = alphaSortedFlow_df.groupby(['YEAR'])[f'{model}'].transform(lambda x: pd.qcut(x, q=3, labels = range(1,4)))
    
    
    # AGGREGATED PORTFOLIO FLOW TIME SERIES #
    alphaSortedFlow_df = alphaSortedFlow_df.groupby(['YEAR','GROUP']).apply(lambda x: np.average(x['FLOW'], weights=x['AUM']))
    alphaFLowDf_list.append(alphaSortedFlow_df)
    del model, alphaSortedFlow_df


# CONCATENATE DATA FRAMES #
alphaFlow_df = pd.concat(alphaFLowDf_list, axis=1)            
del alphaFLowDf_list
alphaFlow_df.columns = ['HF9', 'CAPM', 'FF5', 'FH7']


# RECEIVING DATA FRAME #
alphaFlow_out = pd.DataFrame(columns = ['HF9', 'CAPM', 'FF5', 'FH7'], 
                             index = [1, '1-Tstat', 2, '2-Tstat', 3, '3-Tstat', 
                                      'HighMinusLow', 'HighMinusLow-Tstat', 
                                      'Value', 'Value-Tstat', 'Sign', 'Sign-Tstat'])


# TIME SERIES MEAN AND T-STATISTICS #
for model in ['HF9', 'CAPM', 'FF5', 'FH7']:
    
    
    # EXTRACT MODEL INFERRED FLOWS #
    alphaFlow_model = alphaFlow_df[model].unstack()
    alphaFlow_model['HighMinusLow'] = alphaFlow_model[3] - alphaFlow_model[1]
    
    
    # TIME-SERIES MEAN AND T-STATISTIC #
    alphaFlow_out.loc[[1, 2, 3, 'HighMinusLow'], model] = alphaFlow_model.mean()
    alphaFlow_out.loc[['1-Tstat', '2-Tstat', '3-Tstat', 'HighMinusLow-Tstat'], model] = stats.ttest_1samp(alphaFlow_model, 0).statistic
    
    
    # HOUSE CLEAN #
    del model, alphaFlow_model
    
    
# HOUSE CLEAN #
del alphaFlow_df
    

# SENSITIVITY - VALUE ANALYSIS #
for mod in ['HF9', 'CAPM', 'FF5', 'FH7']:
    
    
    # DEFINE Y AND X #
    panelRegData = hfFlowAlpha_df.set_index(['FUNDID', 'YEAR'])
    y = panelRegData['FLOW']
    X = sm.add_constant(panelRegData[f'{mod}'])
    
    
    # PANEL OLS WITH CLUSTERED SE #
    model = PanelOLS(y, X, entity_effects=False, time_effects=False)
    clusters = {'FUNDID': panelRegData.index.get_level_values('FUNDID'), 
                'YEAR': panelRegData.index.get_level_values('YEAR')}
    results = model.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)
    alphaFlow_out.loc['Value', mod] = results.params[f'{mod}']
    alphaFlow_out.loc['Value-Tstat', mod] = results.tstats[f'{mod}']
    
    
    # HOUSE CLEAN #
    del mod, panelRegData, y, X, model, clusters, results


# SENSITIVITY - SIGN ANALYSIS #
for mod in ['HF9', 'CAPM', 'FF5', 'FH7']:
    
    
    # DEFINE Y AND X #
    panelRegData = hfFlowAlpha_df.set_index(['FUNDID', 'YEAR'])
    panelRegData['FLOW'] = np.where(panelRegData['FLOW']>0, 1, -1)
    panelRegData[f'{mod}'] = np.where(panelRegData[f'{mod}']>0, 1, -1)
    y = panelRegData['FLOW']
    X = sm.add_constant(panelRegData[f'{mod}'])
    
    
    # PANEL OLS WITH CLUSTERED SE #
    model = PanelOLS(y, X, entity_effects=False, time_effects=False)
    clusters = {'FUNDID': panelRegData.index.get_level_values('FUNDID'), 
                'YEAR': panelRegData.index.get_level_values('YEAR')}
    results = model.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)
    alphaFlow_out.loc['Sign', mod] = (results.params[f'{mod}'] + 1 ) / 2
    alphaFlow_out.loc['Sign-Tstat', mod] = results.tstats[f'{mod}']
    
    
    # HOUSE CLEAN #
    del mod, panelRegData, y, X, model, clusters, results


# OUTPUT RESULTS #
alphaFlow_out.to_excel(tabDir + '\\Table11.xlsx')


# HOUSE CLEAN #
del alphaFlow_out, factor_df, hfAlpha_df, hfExRet_df, hfFlow_df, hfFlowAlpha_df, rf_df






############################
#         TABLE 12         # 
############################


# IMPORT FUNCTION #
from lifelines import CoxTimeVaryingFitter


# IMPORT FACTORS # 
factor_df = pd.read_excel('MONTH_FACTOR.xlsx')
rf_df = factor_df.loc[:, ['MONTH','RF']].copy()
factor_df.drop(columns=['RF'], inplace=True)


# IMPORT FAMA-FRENCH FIVE FACTORS #
ff5factor_df = pd.read_excel('MONTH_FF5F.xlsx')
factor_df = pd.merge(left=factor_df, right=ff5factor_df, how='left', on=['MONTH'])
del ff5factor_df


# HEDGE FUND EXCESS RETURNS #
#hfExRet_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfExRet_df = pd.read_pickle('PSEUDODATA1.pkl')
hfExRet_df = hfExRet_df.loc[:,['FUNDID','MONTH','EXRET','CAT','AUM','MFEE',
                               'IFEE','HWM','LCKP','NTC']].copy()
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 713,737 fund-month observations; 7,314 unique funds


# REPLACE ZERO AUM WITH MISSING #
hfExRet_df['AUM'].replace(0, np.NaN, inplace=True)


# CHECK IF HF TABLE DO NOT SKIP MONTHS #
hfExRet_df['MONTH'] = pd.to_datetime(hfExRet_df['MONTH'].astype(str), 
                                     format='%Y%m')
def expand_id_months(group):
    id_ = group['FUNDID'].iloc[0]
    min_month = group['MONTH'].min()
    max_month = group['MONTH'].max()
    full_months = pd.date_range(start=min_month, end=max_month, freq='MS')
    full_group = pd.DataFrame({'FUNDID': id_, 'MONTH': full_months})
    merged = pd.merge(left=full_group, right=group, 
                      on=['FUNDID','MONTH'], how='left')
    return merged

hfExRet_df = hfExRet_df.groupby('FUNDID').apply(expand_id_months).reset_index(drop=True)
hfExRet_df['MONTH'] = hfExRet_df['MONTH'].dt.year*100 + hfExRet_df['MONTH'].dt.month


# ADVANCE NOTICE PERIODS IN MONTH # 
hfExRet_df['NTC'] = hfExRet_df['NTC'] / 30


# ADD HF AGE DATA TO DATA #
hfIncept_df = pd.read_stata('FUNDID_INCEPT.dta')
hfIncept_df['INCEPT'] = hfIncept_df['INCEPT'].astype(int)
hfExRet_df = pd.merge(left=hfExRet_df, right=hfIncept_df, how='left', on=['FUNDID'])
hfExRet_df['AGE'] = (hfExRet_df['MONTH']//100 - hfExRet_df['INCEPT']//100) + (hfExRet_df['MONTH']%100 - hfExRet_df['INCEPT']%100) / 12
hfExRet_df.drop(columns = 'INCEPT', inplace=True)
del hfIncept_df


# X-SECTIONAL WINSORIZED AT 1% AND 99% #
for col in ['AUM','AGE','MFEE','IFEE','HWM','LCKP','NTC']:
    
    hfColHL_df = pd.concat([hfExRet_df.groupby('MONTH')[col].quantile(q=0.99),
                            hfExRet_df.groupby('MONTH')[col].quantile(q=0.01)],
                           axis=1)
    hfColHL_df.columns = ['H','L']
    hfExRet_df = pd.merge(left=hfExRet_df, right=hfColHL_df, 
                          how='left', on='MONTH')
    del hfColHL_df
    
    hfExRet_df.loc[hfExRet_df[col]>hfExRet_df['H'], col] = hfExRet_df.loc[hfExRet_df[col]>hfExRet_df['H'], 'H']
    hfExRet_df.loc[hfExRet_df[col]<hfExRet_df['L'], col] = hfExRet_df.loc[hfExRet_df[col]<hfExRet_df['L'], 'L']
    hfExRet_df.drop(columns=['H','L'], inplace=True)
    del col


# CHECK SUMMARY STATISTICS #
temp = pd.DataFrame(columns=['Mean','SD','10%','25%','Median','75%','90%'],
                    index=['EXRET','AUM','AGE','MFEE','IFEE','HWM','LCKP','NTC'])
for var in temp.index:
    temp.loc[var, 'Mean'] = hfExRet_df[var].mean()
    temp.loc[var, 'SD'] = hfExRet_df[var].std()
    temp.loc[var, '10%'] = hfExRet_df[var].quantile(q=.1)
    temp.loc[var, '25%'] = hfExRet_df[var].quantile(q=.25)
    temp.loc[var, 'Median'] = hfExRet_df[var].quantile(q=.5)
    temp.loc[var, '75%'] = hfExRet_df[var].quantile(q=.75)
    temp.loc[var, '90%'] = hfExRet_df[var].quantile(q=.9)
    del var


# TAKE DECEMBER OBSERVATIONS AS FUND CHARACTERISTICS #
hfChar_df = hfExRet_df.loc[:, ['FUNDID','MONTH','AGE','AUM','MFEE','IFEE',
                               'HWM','LCKP','NTC','CAT']].copy()
hfChar_df['YEAR'] = hfChar_df['MONTH']//100
hfChar_df = hfChar_df.sort_values(by=['FUNDID','MONTH'], ignore_index=True)
hfChar_df = hfChar_df.groupby(['FUNDID','YEAR']).last().reset_index()
hfChar_df['YEAR'] = hfChar_df.groupby('FUNDID')['YEAR'].shift(-1)
hfChar_df = hfChar_df.dropna(subset=['FUNDID','YEAR'])


# YEARLY FLAG OF NEXT YEAR FAILURE # 
hfFailure_df = hfExRet_df.loc[:, ['FUNDID', 'MONTH']].copy()
hfFailure_df['YEAR'] = hfFailure_df['MONTH']//100
hfFailure_df.loc[hfFailure_df['MONTH']>=201907, 'YEAR'] = 2020# DATA ENDS WITH 201908 BUT USE 201907 TO MAINTAIN SAME LEVEL OF # FAILURES
hfFailure_df = hfFailure_df.loc[:, ['FUNDID', 'YEAR']].drop_duplicates()
hfFailure_df = hfFailure_df.sort_values(by=['FUNDID','YEAR'], ignore_index=True)
hfFailure_df = hfFailure_df.groupby(by='FUNDID').last().reset_index()
hfFailure_df['FAILURE'] = 1


    ######################################
    # FUND ALPHA WITH 24 ROLLING WINDOWS #
    ######################################


# MERGE RETURNS WITH FACTORS #
merged_df = pd.merge(left=hfExRet_df, right=factor_df, how='left', on='MONTH')
merged_df = merged_df.sort_values(['FUNDID','MONTH'])


# FUNCTION FOR ROLLING REGRESSION #
def rolling_alpha(group, xList, suffix, window=24, min_periods=24):
    
    
    # RECEIVING LIST FOR ALPHAS #
    alphas = [np.nan] * len(group)
    months = group['MONTH'].reset_index(drop=True)
    
    
    # LOOP OVER DATA #
    for idx in range(len(group)):
        currentMonth = months[idx]
        
        
        # CHECK IF CURRENT MONTH IS JANUARY #
        if currentMonth % 100 == 1:
            
            
            # CHECK IF ENOUGH DATA FOR REGRESSION #
            if idx >= window: 
                
                
                # ROLLING WINDOW SLICE #
                window_slice = group.iloc[idx - window:idx, :].copy()
                
                
                # CHECK FOR MINIMUM NON-MISSING OBSERVATIONS #
                if window_slice['EXRET'].dropna().shape[0] >= min_periods:
                    
                    
                    # DEFINE Y AND XS #
                    Y = window_slice['EXRET'].copy()
                    X = sm.add_constant(window_slice.loc[:, xList])
                    
                    
                    # ALPHA ESTIMATES #
                    model = sm.OLS(Y, X, missing='drop').fit()
                    alpha = model.params['const']
                    alphas[idx] = alpha 
                    
                    
                    # HOUSE CLEAN #
                    del Y, X, model, alpha
            
                # HOUSE CLEAN #
                del window_slice
                
        # HOUSE CLEAN #
        del idx, currentMonth
        
        
    # ASSIGN ALPHAS BACK TO GROUP #
    group = group.reset_index(drop=True)
    group[f'{suffix}'] = alphas
    return group


# ROLLING WINDOW ALPHAS #
hfAlpha_df = merged_df.loc[:, ['FUNDID','MONTH']].copy()
# HF9
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']
temp = merged_df.groupby('FUNDID').apply(rolling_alpha, xList=XNAMELIST, suffix='HF9').reset_index(drop=True)
hfAlpha_df['HF9'] = temp['HF9'].copy()
del XNAMELIST, temp
XNAMELIST = ['MKT']
temp = merged_df.groupby('FUNDID').apply(rolling_alpha, xList=XNAMELIST, suffix='CAPM').reset_index(drop=True)
hfAlpha_df['CAPM'] = temp['CAPM'].copy()
del XNAMELIST, temp
XNAMELIST = ['FF5F_MKT', 'FF5F_SMB', 'FF5F_HML', 'FF5F_RMW', 'FF5F_CMA']
temp = merged_df.groupby('FUNDID').apply(rolling_alpha, xList=XNAMELIST, suffix='FF5').reset_index(drop=True)
hfAlpha_df['FF5'] = temp['FF5'].copy()
del XNAMELIST, temp
XNAMELIST = ['MKT', 'SIZE', 'BDTF', 'CMTF', 'FXTF', '10Y', 'CRDT']
temp = merged_df.groupby('FUNDID').apply(rolling_alpha, xList=XNAMELIST, suffix='FH7').reset_index(drop=True)
hfAlpha_df['FH7'] = temp['FH7'].copy()
del XNAMELIST, temp


# ONLY KEEP YEARLY OBSERVATIONS #
hfAlpha_df['YEAR'] = hfAlpha_df['MONTH'] // 100
hfAlpha_df['MONTH'] = hfAlpha_df['MONTH'] % 100
hfAlpha_df = hfAlpha_df.loc[hfAlpha_df['MONTH']==1, :].copy()
hfAlpha_df.drop(columns=['MONTH'], inplace=True)
hfAlpha_df.dropna(inplace=True)


# HOUSE CLEAN #
del merged_df


    ###############################################
    # FUND FAILURE MERGE WITH ALPHAS AND CONTROLS #
    ###############################################


# MERGE FAILURE INDICATOR WITH ALPHAS #
hfFailureAlpha_df = pd.merge(left=hfAlpha_df, right=hfFailure_df, 
                             how='left', on=['FUNDID', 'YEAR'])
hfFailureAlpha_df['FAILURE'] = hfFailureAlpha_df['FAILURE'].fillna(0)


# MERGE WITH FUND CONTROLS #
hfFailureAlphaCtrl_df = pd.merge(left=hfFailureAlpha_df, right=hfChar_df, 
                                 how='left', on=['FUNDID', 'YEAR'])


# LOG(AGE) #
hfFailureAlphaCtrl_df['LAGE'] = np.log(hfFailureAlphaCtrl_df['AGE'])


# LOG(FUND SIZE) #
hfFailureAlphaCtrl_df['LSIZE'] = np.log(hfFailureAlphaCtrl_df['AUM'])


# CATEGORY DUMMIES #
hfFailureAlphaCtrl_df = pd.get_dummies(hfFailureAlphaCtrl_df, prefix='CAT', 
                                       columns=['CAT'], drop_first=True)
hfFailureAlphaCtrl_df = hfFailureAlphaCtrl_df.dropna()


    ################################
    # FAILURE-PERFORMANCE ANALYSIS #
    ################################


# RECEIVING DATA FRAME #
alphaFailure_out = pd.DataFrame(columns = ['HF9', 'CAPM', 'FF5', 'FH7'], 
                             index = ['LOW ALPHA', 'LOW ALPHA-t',
                                      'HIGH ALPHA', 'HIGH ALPHA-t',
                                      'LAGE', 'LAGE-t', 'LSIZE', 'LSIZE-t', 
                                      'MFEE', 'MFEE-t', 'IFEE', 'IFEE-t', 
                                      'HWM', 'HWM-t', 'LCKP', 'LCKP-t',
                                      'NTC', 'NTC-t'])


# PROPORTIONAL HAZARD ANALYSIS #
for model in ['HF9', 'CAPM', 'FF5', 'FH7']:
    
    
    # EXTRACT FAILURE AND MODEL ALPHAS #
    alphaFailure_model = hfFailureAlphaCtrl_df.loc[:, ['FUNDID','YEAR','FAILURE',f'{model}', 
                                                       'LAGE','LSIZE','MFEE','IFEE',
                                                       'HWM','LCKP','NTC',
                                                       'CAT_DSB','CAT_ED','CAT_EM', 
                                                       'CAT_EMN','CAT_FIA','CAT_FOF','CAT_GM', 
                                                       'CAT_LSEH','CAT_MS']].copy()
    
    
    # CROSS-SECTIONALLY SORT BY ALPHAS #
    alphaFailure_model['ALPHA'] = alphaFailure_model.groupby(['YEAR'])[f'{model}'].transform(lambda x: pd.qcut(x, q=3, labels = range(1,4)))
    alphaFailure_model['LOW ALPHA'] = np.where(alphaFailure_model['ALPHA']==1, 1, 0)
    alphaFailure_model['HIGH ALPHA'] = np.where(alphaFailure_model['ALPHA']==3, 1, 0)
    
    
    # PREPARE DATA FOR COX TIME VARYING FITTER #
    alphaFailure_model['START'] = alphaFailure_model['YEAR'] - 1
    alphaFailure_model['STOP'] = alphaFailure_model['YEAR']
    alphaFailure_model = alphaFailure_model.loc[:, ['FUNDID','START','STOP','FAILURE',
                                                    'LOW ALPHA','HIGH ALPHA',
                                                    'LAGE','LSIZE','MFEE','IFEE',
                                                    'HWM','LCKP','NTC',
                                                    'CAT_DSB','CAT_ED','CAT_EM',
                                                    'CAT_EMN','CAT_FIA','CAT_FOF','CAT_GM', 
                                                    'CAT_LSEH','CAT_MS']].dropna()
    
    ctv = CoxTimeVaryingFitter()
    ctv.fit(alphaFailure_model, id_col='FUNDID', start_col='START',
            stop_col='STOP', event_col='FAILURE')
    
    
    # OUTPUT RESULTS #
    expCoefList = list()
    zscoreList = list()
    for var in ['LOW ALPHA','HIGH ALPHA','LAGE','LSIZE','MFEE','IFEE','HWM','LCKP','NTC']:
        
        
        expCoef_var = ctv.summary.loc[var, 'exp(coef)']
        zscore_var = ctv.summary.loc[var, 'z']
        expCoefList.append(f'{expCoef_var:.2f}')
        zscoreList.append(f'({zscore_var:.2f})')
        
        del var, expCoef_var, zscore_var
        
    alphaFailure_out.loc[['LOW ALPHA','HIGH ALPHA','LAGE','LSIZE','MFEE','IFEE','HWM','LCKP','NTC'], model] = expCoefList
    alphaFailure_out.loc[['LOW ALPHA-t','HIGH ALPHA-t','LAGE-t','LSIZE-t','MFEE-t','IFEE-t','HWM-t','LCKP-t','NTC-t'], model] = zscoreList
    del model, alphaFailure_model, ctv, expCoefList, zscoreList


# OUTPUT RESULTS #
alphaFailure_out.loc['STYLE DUMMIES',:] = ['Yes','Yes','Yes','Yes']
alphaFailure_out.to_excel(tabDir + '/Table12.xlsx')


# HOUSE CLEAN #
del alphaFailure_out, factor_df, hfAlpha_df, hfChar_df, hfExRet_df, hfFailure_df
del hfFailureAlpha_df, hfFailureAlphaCtrl_df, rf_df






############################
#         TABLE 13         # 
############################


# IMPORT FACTORS # 
factor_df = pd.read_excel('MONTH_FACTOR.xlsx')
factor_df.drop(columns=['RF'], inplace=True)


# HEDGE FUND EXCESS RETURNS #
#hfExRet_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfExRet_df = pd.read_pickle('PSEUDODATA1.pkl')
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 713,737 fund-month observations; 7,314 unique funds


# EXTRACT NEEDED FUND CHARS #
hfExRet_df = hfExRet_df.loc[:,['FUNDID','MONTH','EXRET','CAT', 
                               'AUM','MIN','MFEE','IFEE','HWM','LCKP','NTC']].copy()


# GET STRATEGY CATEGORY MAPPING & USE AS RECEIVING DATA FRAME #
out_df = hfExRet_df.loc[:,['FUNDID', 'CAT']].drop_duplicates(subset=['FUNDID'], keep='last')


    ##################################################
    # HEDGE FUND ALPHA DISTRIBUTION ESTIMATION - HF9 #
    ##################################################


# 9 FACTOR LIST #
xNameList = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']


# LIST FOR BOOTSTRAP DATA FRAMES OF BETA * X & RESIDUAL RECEIVING #
bootDf_list = list()

# FUND LIST TO LOOP #
fundList = list(hfExRet_df['FUNDID'].unique())


# LOOP THROUGH FUNDS #
for fund in fundList:
    
    
    # DISPLAY STATUS TO USER #
    print(f'Hedge Fund: {fund}.')
    
    
    # TIME-SERIES REGRESSION DATA #
    regData = pd.merge(left=hfExRet_df.loc[hfExRet_df['FUNDID']==fund, ['MONTH', 'EXRET']], 
                       right=factor_df.loc[:,['MONTH']+xNameList], 
                       how='left', on='MONTH')
    
    
    # TIME-SERIES REGRESSION #
    factorModel = sm.OLS(regData['EXRET'], sm.add_constant(regData.loc[:, xNameList])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    #print(factorModel.summary())
    
    
    # ALPHA AND T-STATISTIC #
    out_df.loc[out_df['FUNDID']==fund, 'Alpha'] = factorModel.params[0]
    out_df.loc[out_df['FUNDID']==fund, 'Tstat'] = factorModel.tvalues[0]
    
    
    # BETA * X AS FITTED VALUE - ALPHA & RESIDUALS #
    regData['FUNDID'] = fund
    regData['BETAX'] = factorModel.fittedvalues - factorModel.params[0]
    regData['RESID'] = factorModel.resid
    bootDf_list.append(regData.loc[:,['FUNDID', 'MONTH','BETAX','RESID']].copy())
    
    
    # HOUSE CLEAN #
    del fund, regData, factorModel


# CONCATENATE BOOTSTRAP DATA FRAMES #
boot_df = pd.concat(bootDf_list, ignore_index=True)
del bootDf_list


# HOUSE CLEAN #
del fundList


    #####################################################
    # BOOTSTAP ANALYSIS OF THE BEST & WORST ALPHA FUNDS #
    #####################################################


# RECEIVING DATA FRAME #
catList = list(hfExRet_df['CAT'].unique())
catList.sort()
indexList = ['All Funds', 'All Funds_p']
for cat in catList:
    indexList.append(cat)
    indexList.append(cat+'_p')
    del cat 
topBottom_df = pd.DataFrame(index=indexList, columns=range(1, 15))
del indexList


        #############
        # ALL FUNDS #
        #############
    

# TOP AND BOTTOM 15 FUNDS BASED ON TSTAT #
fundList = out_df.sort_values(by=['Tstat'])
fundList = pd.concat([fundList.head(15), fundList.tail(15)], ignore_index=True)


# TOP AND BOTTOM 1-5, 10, 15 FUNDS #
fundList = fundList.iloc[[0,1,2,3,4,9,14,15,20,25,26,27,28,29]].copy()


# RECEIVING DATA FRAME FOR TOP AND BOTTOM 15 FUNDS #
bootT_df = pd.DataFrame(columns=fundList['FUNDID'].unique(), index=range(10000))


# LOOP THROUGH EACH FUND #
for i, fund in enumerate(bootT_df.columns):
    
    
    # DISPLAY STATUS TO USER #
    print(f'Fund #{i+1} out of 14: Seed 1 ==>', end = ' ')
    
    
    # LOOP THROUGH RANDOM SEEDS #
    for seed in range(10000):
        
        
        # DISPLAY STATUS TO USER #
        if ( seed + 1 ) % 1000 == 0:
            
            print(seed+1, end=' ==> ')
        
        # SAMPLE RESIDUALS WITH REPLACEMENT #
        regData_seed = boot_df.loc[boot_df['FUNDID']==fund, :].copy()
        regData_seed['RESID'] = regData_seed['RESID'].sample(frac=1, replace=True, random_state=seed).values
        
        
        # BOOTSTRAPPED HEDGE FUND RETURNS UNDER NULL HYPOTHESIS ALPHA = 0 #
        regData_seed['EXRET'] = regData_seed['BETAX'].copy() + regData_seed['RESID'].copy()
        
        
        # TIME-SERIES REGRESSION DATA #
        regData_seed = pd.merge(left=regData_seed.loc[:, ['MONTH', 'EXRET']], 
                                right=factor_df.loc[:,['MONTH']+xNameList], 
                                how='left', on='MONTH')
    
    
        # TIME-SERIES REGRESSION #
        factorModel = sm.OLS(regData_seed['EXRET'], 
                             sm.add_constant(regData_seed.loc[:, xNameList])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
        #print(factorModel.summary())
    
    
        # BOOTSTRAPPED T-STATISTIC #
        bootT_df.loc[seed, fund] = factorModel.tvalues[0]
        
        
        # HOUSE CLEAN #
        del seed, regData_seed, factorModel
    
    
    # DISPLAY STATUS TO USER #
    print('done.')
    
    
    # HOUSE CLEAN #
    del i, fund 
    

# EMPIRICAL P-VALUE #
# P(TB > T-STAT) FOR T-STAT > 0 #
# 1 - P(TB > T-STAT) FOR T-STAT < 0 #
for i, fund in enumerate(bootT_df.columns):
    
    
    # GET T-STATISTIC FOR FUND ALPHA #
    tStat_fund = float(out_df.loc[out_df['FUNDID']==fund, 'Tstat'])
    topBottom_df.loc['All Funds', i+1] = tStat_fund
    
    
    # EMPIRICAL P-VALUE #
    if tStat_fund > 0:    
        topBottom_df.loc['All Funds_p', i+1] = (bootT_df[fund] > tStat_fund).sum() / 10000
    else:
        topBottom_df.loc['All Funds_p', i+1] = 1 - (bootT_df[fund] > tStat_fund).sum() / 10000
        
    
    # HOUSE CLEAN #
    del i, fund, tStat_fund
    

# HOUSE CLEAN #
del fundList, bootT_df
    
    
        ##################
        # CATEGORY LEVEL #
        ##################
    
    
# LOOP THROUGH CATEGORY #
for cat in catList:
    
    
    # DISPLAY STATUS TO USER #
    print(f'HEDGE FUND CATEGORY: {cat}.')
    
    
    # ONLY EXTRACT CATEGORY #
    outDf_cat = out_df.loc[out_df['CAT']==cat,:].copy()
    
    
    # TOP AND BOTTOM 15 FUNDS BASED ON ALPHA #
    fundList = outDf_cat.sort_values(by=['Tstat'])
    fundList = pd.concat([fundList.head(15), fundList.tail(15)], ignore_index=True)


    # TOP AND BOTTOM 1-5, 10, 15 FUNDS #
    fundList = fundList.iloc[[0,1,2,3,4,9,14,15,20,25,26,27,28,29]].copy()


    # RECEIVING DATA FRAME FOR TOP AND BOTTOM 15 FUNDS #
    bootT_df = pd.DataFrame(columns=fundList['FUNDID'].unique(), index=range(10000))
    
    
    # LOOP THROUGH EACH FUND #
    for i, fund in enumerate(bootT_df.columns):
        
        
        # DISPLAY STATUS TO USER #
        print(f'Fund #{i+1} out of 14: Seed 1 ==>', end = ' ')
        
        
        # LOOP THROUGH RANDOM SEEDS #
        for seed in range(10000):
            
            
            # DISPLAY STATUS TO USER #
            if ( seed + 1 ) % 1000 == 0:
                
                print(seed+1, end=' ==> ')
                
            # SAMPLE RESIDUALS WITH REPLACEMENT #
            regData_seed = boot_df.loc[boot_df['FUNDID']==fund, :].copy()
            regData_seed['RESID'] = regData_seed['RESID'].sample(frac=1, replace=True, random_state=seed).values
            
            
            # BOOTSTRAPPED HEDGE FUND RETURNS UNDER NULL HYPOTHESIS ALPHA = 0 #
            regData_seed['EXRET'] = regData_seed['BETAX'].copy() + regData_seed['RESID'].copy()
            
            
            # TIME-SERIES REGRESSION DATA #
            regData_seed = pd.merge(left=regData_seed.loc[:, ['MONTH', 'EXRET']], 
                                right=factor_df.loc[:,['MONTH']+xNameList], 
                                how='left', on='MONTH')
            
            
            # TIME-SERIES REGRESSION #
            factorModel = sm.OLS(regData_seed['EXRET'], 
                             sm.add_constant(regData_seed.loc[:, xNameList])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
            #print(factorModel.summary())
            
            
            # BOOTSTRAPPED T-STATISTIC #
            bootT_df.loc[seed, fund] = factorModel.tvalues[0]
            
            
            # HOUSE CLEAN #
            del seed, regData_seed, factorModel
            
            
        # DISPLAY STATUS TO USER #
        print('done.')
        
        
        # HOUSE CLEAN #
        del i, fund 
        
        
    # EMPIRICAL P-VALUE #
    for i, fund in enumerate(bootT_df.columns):
        
        
        # GET T-STATISTIC FOR FUND ALPHA #
        tStat_fund = float(out_df.loc[out_df['FUNDID']==fund, 'Tstat'])
        topBottom_df.loc[f'{cat}', i+1] = tStat_fund
        
        
        # EMPIRICAL P-VALUE #
        # P(TB > T-STAT) FOR T-STAT > 0 #
        # 1 - P(TB > T-STAT) FOR T-STAT < 0 #
        if tStat_fund > 0:
            topBottom_df.loc[f'{cat}_p', i+1] = (bootT_df[fund] > tStat_fund).sum() / 10000
        else:
            topBottom_df.loc[f'{cat}_p', i+1] = 1 - (bootT_df[fund] > tStat_fund).sum() / 10000
            
        
        # HOUSE CLEAN #
        del i, fund, tStat_fund
        
        
    # HOUSE CLEAN #
    del cat, outDf_cat, fundList, bootT_df
        

# BOOTSTRAP OUTPUT #
topBottom_df.columns = ['Bottom','2.min','3.min','4.min','5.min', 
                        '10.min','15.min','15.max','10.max',
                        '5.max','4.max','3.max','2.max','Top']


# OUTPUT RESULT TABLE #
topBottom_df.to_excel(tabDir + '\\Table13.xlsx')


# HOUSE CLEAN #
del boot_df, catList, factor_df, hfExRet_df, out_df, topBottom_df, xNameList






############################
#         FIGURE 4         # 
############################


# IMPORT FACTORS # 
factor_df = pd.read_excel('MONTH_FACTOR.xlsx')
factor_df.drop(columns=['RF'], inplace=True)


# HEDGE FUND EXCESS RETURNS #
#hfExRet_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfExRet_df = pd.read_pickle('PSEUDODATA1.pkl')
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 713,737 fund-month observations; 7,314 unique funds


# EXTRACT FUND EXCESS RETURNS AND CATEGORY #
hfExRet_df = hfExRet_df.loc[:,['FUNDID','MONTH','EXRET','CAT']].copy()


# GET STRATEGY CATEGORY MAPPING & USE AS RECEIVING DATA FRAME #
out_df = hfExRet_df.loc[:,['FUNDID', 'CAT']].drop_duplicates(subset=['FUNDID'], keep='last')


    ##################################################
    # HEDGE FUND ALPHA DISTRIBUTION ESTIMATION - HF9 #
    ##################################################


# 9 FACTOR LIST #
xNameList = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']


# LIST FOR BOOTSTRAP DATA FRAMES OF BETA * X & RESIDUAL RECEIVING #
bootDf_list = list()

# FUND LIST TO LOOP #
fundList = list(hfExRet_df['FUNDID'].unique())


# LOOP THROUGH FUNDS #
for fund in fundList:
    
    
    # DISPLAY STATUS TO USER #
    print(f'Hedge Fund: {fund}.')
    
    
    # TIME-SERIES REGRESSION DATA #
    regData = pd.merge(left=hfExRet_df.loc[hfExRet_df['FUNDID']==fund, ['MONTH', 'EXRET']], 
                       right=factor_df.loc[:,['MONTH']+xNameList], 
                       how='left', on='MONTH')
    
    
    # TIME-SERIES REGRESSION #
    factorModel = sm.OLS(regData['EXRET'], sm.add_constant(regData.loc[:, xNameList])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
    #print(factorModel.summary())
    
    
    # ALPHA AND T-STATISTIC #
    out_df.loc[out_df['FUNDID']==fund, 'Alpha'] = factorModel.params[0]
    out_df.loc[out_df['FUNDID']==fund, 'Tstat'] = factorModel.tvalues[0]
    
    
    # BETA * X AS FITTED VALUE - ALPHA & RESIDUALS #
    regData['FUNDID'] = fund
    regData['BETAX'] = factorModel.fittedvalues - factorModel.params[0]
    regData['RESID'] = factorModel.resid
    bootDf_list.append(regData.loc[:,['FUNDID', 'MONTH','BETAX','RESID']].copy())
    
    
    # HOUSE CLEAN #
    del fund, regData, factorModel


# CONCATENATE BOOTSTRAP DATA FRAMES #
boot_df = pd.concat(bootDf_list, ignore_index=True)
del bootDf_list


# HOUSE CLEAN #
del fundList


    #####################################################
    # BOOTSTAP ANALYSIS OF THE BEST & WORST ALPHA FUNDS #
    #####################################################


# RECEIVING DATA FRAME #
topBottom_df = pd.DataFrame(index=['All Funds', 'All Funds_p'], 
                            columns=range(1, 11))
    

# TOP AND BOTTOM FUNDS BASED ON TSTAT #
fundList = out_df.sort_values(by=['Tstat'], ignore_index=True)
numFunds = fundList.shape[0]
indexList = [np.ceil(numFunds*0.01), np.ceil(numFunds*0.02), 
             np.ceil(numFunds*0.03), np.ceil(numFunds*0.05), 
             np.ceil(numFunds*0.1), np.floor(numFunds*0.9),
             np.floor(numFunds*0.95), np.floor(numFunds*0.97),
             np.floor(numFunds*0.98), np.floor(numFunds*0.99)]
fundList = fundList.iloc[indexList, :].reset_index(drop=True)


# RECEIVING DATA FRAME FOR TOP AND BOTTOM FUNDS #
bootT_df = pd.DataFrame(columns=fundList['FUNDID'].unique(), index=range(10000))


# LOOP THROUGH EACH FUND #
for i, fund in enumerate(bootT_df.columns):
    
    
    # DISPLAY STATUS TO USER #
    print(f'Fund #{i+1} out of 10: Seed 1 ==>', end = ' ')
    
    
    # LOOP THROUGH RANDOM SEEDS #
    for seed in range(10000):
        
        
        # DISPLAY STATUS TO USER #
        if ( seed + 1 ) % 1000 == 0:
            
            print(seed+1, end=' ==> ')
        
        # SAMPLE RESIDUALS WITH REPLACEMENT #
        regData_seed = boot_df.loc[boot_df['FUNDID']==fund, :].copy()
        regData_seed['RESID'] = regData_seed['RESID'].sample(frac=1, replace=True, random_state=seed).values
        
        
        # BOOTSTRAPPED HEDGE FUND RETURNS UNDER NULL HYPOTHESIS ALPHA = 0 #
        regData_seed['EXRET'] = regData_seed['BETAX'].copy() + regData_seed['RESID'].copy()
        
        
        # TIME-SERIES REGRESSION DATA #
        regData_seed = pd.merge(left=regData_seed.loc[:, ['MONTH', 'EXRET']], 
                                right=factor_df.loc[:,['MONTH']+xNameList], 
                                how='left', on='MONTH')
    
    
        # TIME-SERIES REGRESSION #
        factorModel = sm.OLS(regData_seed['EXRET'], 
                             sm.add_constant(regData_seed.loc[:, xNameList])).fit(cov_type='HAC',cov_kwds={'maxlags':3})
        #print(factorModel.summary())
    
    
        # BOOTSTRAPPED T-STATISTIC #
        bootT_df.loc[seed, fund] = factorModel.tvalues[0]
        
        
        # HOUSE CLEAN #
        del seed, regData_seed, factorModel
    
    
    # DISPLAY STATUS TO USER #
    print('done.')
    
    
    # HOUSE CLEAN #
    del i, fund 
    

# PLOT PARAMETERS #
font = {'size'   : 16}
matplotlib.rc('font', **font)
    

# EMPIRICAL P-VALUE #
# P(TB > T-STAT) FOR T-STAT > 0 #
# 1 - P(TB > T-STAT) FOR T-STAT < 0 #
for i, fund in enumerate(bootT_df.columns):
    
    
    # GET T-STATISTIC FOR FUND ALPHA #
    tStat_fund = float(out_df.loc[out_df['FUNDID']==fund, 'Tstat'])
    topBottom_df.loc['All Funds', i+1] = tStat_fund
    
    
    # EMPIRICAL P-VALUE #
    if tStat_fund > 0:    
        topBottom_df.loc['All Funds_p', i+1] = (bootT_df[fund] > tStat_fund).sum() / 10000
    else:
        topBottom_df.loc['All Funds_p', i+1] = 1 - (bootT_df[fund] > tStat_fund).sum() / 10000
    
    
    # DENSITY PLOT #
    fig, ax = plt.subplots(figsize=(10, 5))

    sns.kdeplot(data=bootT_df[fund], linestyle='solid', 
                linewidth=1, fill=True, color='dimgray')
    plt.ylim(top=0.4)
    plt.xlim(left=-6, right=6)
    plt.xlabel('t-statistic')
    plt.ylabel('Kernel density')

    plt.axvline(x=tStat_fund, color='darkblue', linewidth=2)
    plt.savefig(figDir+f'\\Figure4_Panel{i+1}.png')
    plt.close()
    
    
    # HOUSE CLEAN #
    del i, fund, tStat_fund, fig, ax
    

# HOUSE CLEAN #
del boot_df, factor_df, fundList, hfExRet_df, indexList, numFunds, out_df, topBottom_df, xNameList, bootT_df






############################
#         TABLE 14         # 
############################


# SET UP STATA CONFIGURATION #
import stata_setup
stataConfig = 'D:\\Program Files\\Stata18\\'
stataEdition = 'se'
stata_setup.config('D:\\Program Files\\Stata18', 'se')
from pystata import stata


# IMPORT FACTORS # 
factor_df = pd.read_excel('MONTH_FACTOR.xlsx')
factor_df.drop(columns=['RF'], inplace=True)


# HEDGE FUND EXCESS RETURNS AND CHARS #
#hfExRet_df = pd.read_pickle('FUNDID_MONTH_EXRET_HFCHARS_199701_201908.pkl')
hfExRet_df = pd.read_pickle('PSEUDODATA1.pkl')
hfExRet_df = hfExRet_df.loc[:,['FUNDID','MONTH','EXRET','CAT','AUM',
                               'MFEE','IFEE','HWM','LCKP','NTC']].copy()
hfExRet_df.dropna(subset=['EXRET'], inplace=True)
# 713,737 fund-month observations; 7,314 unique funds


# REPLACE ZERO AUM WITH MISSING #
hfExRet_df['AUM'].replace(0, np.NaN, inplace=True)


# CHECK IF HF TABLE DO NOT SKIP MONTHS #
hfExRet_df['MONTH'] = pd.to_datetime(hfExRet_df['MONTH'].astype(str), 
                                     format='%Y%m')
def expand_id_months(group):
    id_ = group['FUNDID'].iloc[0]
    min_month = group['MONTH'].min()
    max_month = group['MONTH'].max()
    full_months = pd.date_range(start=min_month, end=max_month, freq='MS')
    full_group = pd.DataFrame({'FUNDID': id_, 'MONTH': full_months})
    merged = pd.merge(left=full_group, right=group, 
                      on=['FUNDID','MONTH'], how='left')
    return merged

hfExRet_df = hfExRet_df.groupby('FUNDID').apply(expand_id_months).reset_index(drop=True)
hfExRet_df['MONTH'] = hfExRet_df['MONTH'].dt.year*100 + hfExRet_df['MONTH'].dt.month 


# ADVANCE NOTICE PERIODS IN MONTH # 
hfExRet_df['NTC'] = hfExRet_df['NTC'] / 30


# ADD HF AGE DATA TO DATA #
hfIncept_df = pd.read_stata('FUNDID_INCEPT.dta')
hfIncept_df['INCEPT'] = hfIncept_df['INCEPT'].astype(int)
hfExRet_df = pd.merge(left=hfExRet_df, right=hfIncept_df, how='left', on=['FUNDID'])
hfExRet_df['AGE'] = (hfExRet_df['MONTH']//100 - hfExRet_df['INCEPT']//100) + (hfExRet_df['MONTH']%100 - hfExRet_df['INCEPT']%100) / 12
hfExRet_df.drop(columns = 'INCEPT', inplace=True)
del hfIncept_df


# X-SECTIONAL WINSORIZED AT 1% AND 99% #
for col in ['AUM','AGE','MFEE','IFEE','HWM','LCKP','NTC']:
    
    hfColHL_df = pd.concat([hfExRet_df.groupby('MONTH')[col].quantile(q=0.99),
                            hfExRet_df.groupby('MONTH')[col].quantile(q=0.01)],
                           axis=1)
    hfColHL_df.columns = ['H','L']
    hfExRet_df = pd.merge(left=hfExRet_df, right=hfColHL_df, 
                          how='left', on='MONTH')
    del hfColHL_df
    
    hfExRet_df.loc[hfExRet_df[col]>hfExRet_df['H'], col] = hfExRet_df.loc[hfExRet_df[col]>hfExRet_df['H'], 'H']
    hfExRet_df.loc[hfExRet_df[col]<hfExRet_df['L'], col] = hfExRet_df.loc[hfExRet_df[col]<hfExRet_df['L'], 'L']
    hfExRet_df.drop(columns=['H','L'], inplace=True)
    del col


# CHECK SUMMARY STATISTICS #
temp = pd.DataFrame(columns=['Mean','SD','10%','25%','Median','75%','90%'],
                    index=['EXRET','AUM','AGE','MFEE','IFEE','HWM','LCKP','NTC'])
for var in temp.index:
    temp.loc[var, 'Mean'] = hfExRet_df[var].mean()
    temp.loc[var, 'SD'] = hfExRet_df[var].std()
    temp.loc[var, '10%'] = hfExRet_df[var].quantile(q=.1)
    temp.loc[var, '25%'] = hfExRet_df[var].quantile(q=.25)
    temp.loc[var, 'Median'] = hfExRet_df[var].quantile(q=.5)
    temp.loc[var, '75%'] = hfExRet_df[var].quantile(q=.75)
    temp.loc[var, '90%'] = hfExRet_df[var].quantile(q=.9)
    del var


    ################################################
    # FUND ALPHA WITH BETA FROM 24 ROLLING WINDOWS #
    ################################################


# MERGE RETURNS WITH FACTORS #
merged_df = pd.merge(left=hfExRet_df, right=factor_df, how='left', on='MONTH')
merged_df = merged_df.sort_values(['FUNDID','MONTH'])


# FUNCTION FOR ROLLING REGRESSION #
def rolling_beta_alpha(group, xList, suffix, window=24, min_periods=24):
    
    
    # RECEIVING LIST FOR ALPHAS #
    alphas = [np.nan] * len(group)
    
    
    # LOOP OVER DATA #
    for idx in range(len(group)):
        
        # CHECK IF ENOUGH DATA FOR REGRESSION #
        if idx >= window: 
            
            
            # ROLLING WINDOW SLICE #
            window_slice = group.iloc[idx - window:idx, :].copy()
            
            
            # CHECK FOR MINIMUM NON-MISSING OBSERVATIONS #
            if window_slice['EXRET'].dropna().shape[0] >= min_periods:
                
                
                # DEFINE Y AND XS #
                Y = window_slice['EXRET'].copy()
                X = sm.add_constant(window_slice.loc[:, xList])
                
                
                # ALPHA ESTIMATES #
                model = sm.OLS(Y, X, missing='drop').fit()
                
                
                # ALPHA = RETURN - BETA * FACTOR
                betas = model.params.iloc[1:]
                predict_month = group.iloc[[idx], :].copy()
                Y_month = predict_month['EXRET'].copy()
                X_month = predict_month.loc[:, xList].copy()
                alphas[idx] = float(Y_month - X_month @ betas)
                
                
                # HOUSE CLEAN #
                del Y, X, model, betas, predict_month, Y_month, X_month
            
            
            # HOUSE CLEAN #
            del window_slice
            
        
        # HOUSE CLEAN #
        del idx
    
    
    # ASSIGN ALPHAS BACK TO GROUP #
    group = group.reset_index(drop=True)
    group[f'{suffix}'] = alphas
    return group[['FUNDID','MONTH',f'{suffix}']]


# ROLLING WINDOW ALPHAS #
hfAlpha_df = merged_df.loc[:, ['FUNDID','MONTH']].copy()
XNAMELIST = ['AGR', 'BAB', 'LRSK', 'MKT', 'ROA', 'TSMOM', '10Y', 'CRDT', 'TRM']
temp = merged_df.groupby('FUNDID').apply(rolling_beta_alpha, xList=XNAMELIST, suffix='HF9').reset_index(drop=True)
hfAlpha_df['HF9'] = temp['HF9'].copy()
del XNAMELIST, temp


# HOUSE CLEAN #
del merged_df


    ###########################
    # FAMA-MACBETH REGRESSION #
    ###########################


# FUND-MONTH LEVEL CONTROLS #
hfAlphaChar_df = hfAlpha_df.dropna()
hfChar = hfExRet_df.loc[:, ['FUNDID', 'MONTH', 
                            'AGE','AUM','MFEE','IFEE','HWM',
                            'LCKP','NTC','CAT']].copy()
hfChar['MONTH'] = hfChar.groupby('FUNDID')['MONTH'].shift(-1)
hfAlphaChar_df = pd.merge(left=hfAlphaChar_df, right=hfChar, 
                          how='inner', on=['FUNDID','MONTH'])


# LOG(AGE) #
hfAlphaChar_df['LAGE'] = np.log(hfAlphaChar_df['AGE'])


# LOG(FUND SIZE) #
hfAlphaChar_df['LSIZE'] = np.log(hfAlphaChar_df['AUM'])


# CATEGORY DUMMIES #
hfAlphaChar_df = pd.get_dummies(hfAlphaChar_df, prefix='CAT', 
                                columns=['CAT'], drop_first=True)
hfAlphaChar_df = hfAlphaChar_df.dropna()


# LOAD IN STATA MEMORY #
stata.pdataframe_to_data(hfAlphaChar_df)


# RECEIVING DATA FRAME FOR FAMA-MECBETH REGRESSION RESULTS #
fmOut_df = pd.DataFrame(columns=['(1)','(2)', '(3)'], 
                        index=['INTERCEPT','INTERCEPT-t', 
                               'LAGE','LAGE-t','LSIZE','LSIZE-t',
                               'MFEE','MFEE-t','IFEE','IFEE-t',
                               'HWM','HWM-t','LCKP','LCKP-t',
                               'NTC','NTC-t','CAT_DSB','CAT_DSB-t',
                               'CAT_ED','CAT_ED-t','CAT_EM','CAT_EM-t',
                               'CAT_EMN','CAT_EMN-t','CAT_FIA','CAT_FIA-t',
                               'CAT_FOF','CAT_FOF-t','CAT_GM','CAT_GM-t',
                               'CAT_LSEH','CAT_LSEH-t','CAT_MS','CAT_MS-t',
                               'Adj-R2','N'])


# RUN FAMA-MACBETH ALPHA ~ LAGE + LSIZE + MFEE + IFEE IN STATA #
stata.run('''
          sort FUNDID MONTH 
          xtset FUNDID MONTH
          asreg HF9 LAGE LSIZE MFEE IFEE, fmb newey(3)
          ''')
res = stata.get_ereturn()


# GET PARAMETERS #
fmOut_df.loc[['LAGE','LSIZE','MFEE','IFEE','INTERCEPT'], '(1)'] = res['e(b)']
fmOut_df.loc[['LAGE-t','LSIZE-t','MFEE-t','IFEE-t','INTERCEPT-t'], '(1)'] = res['e(b)'] / np.sqrt(np.diag(res['e(V)']))

fmOut_df.loc['Adj-R2', '(1)'] = res['e(r2_a)']
fmOut_df.loc['N', '(1)'] = res['e(N)']


# RUN FAMA-MACBETH ALPHA ~ LAGE + LSIZE  + MFEE + IFEE + HWM + LCKP + NTC IN STATA #
stata.run('''
          sort FUNDID MONTH 
          xtset FUNDID MONTH
          asreg HF9 LAGE LSIZE MFEE IFEE HWM LCKP NTC, fmb newey(3)
          ''')
res = stata.get_ereturn()


# GET PARAMETERS #
fmOut_df.loc[['LAGE','LSIZE','MFEE','IFEE','HWM','LCKP','NTC','INTERCEPT'], '(2)'] = res['e(b)']
fmOut_df.loc[['LAGE-t','LSIZE-t','MFEE-t','IFEE-t','HWM-t','LCKP-t','NTC-t','INTERCEPT-t'], '(2)'] = res['e(b)'] / np.sqrt(np.diag(res['e(V)']))

fmOut_df.loc['Adj-R2', '(2)'] = res['e(r2_a)']
fmOut_df.loc['N', '(2)'] = res['e(N)']


# RUN FAMA-MACBETH ALPHA ~ LAGE + LSIZE  + MFEE + IFEE + HWM + LCKP + NTC + STYLES IN STATA #
stata.run('''
          sort FUNDID MONTH 
          xtset FUNDID MONTH
          asreg HF9 LAGE LSIZE MFEE IFEE HWM LCKP NTC CAT_*, fmb newey(3)
          ''')
res = stata.get_ereturn()


# GET PARAMETERS #
fmOut_df.loc[['LAGE','LSIZE','MFEE','IFEE','HWM','LCKP','NTC','CAT_DSB','CAT_ED',
              'CAT_EM','CAT_EMN','CAT_FIA','CAT_FOF','CAT_GM','CAT_LSEH','CAT_MS',
              'INTERCEPT'], '(3)'] = res['e(b)']
fmOut_df.loc[['LAGE-t','LSIZE-t','MFEE-t','IFEE-t','HWM-t','LCKP-t','NTC-t','CAT_DSB-t',
              'CAT_ED-t','CAT_EM-t','CAT_EMN-t','CAT_FIA-t','CAT_FOF-t','CAT_GM-t',
              'CAT_LSEH-t','CAT_MS-t','INTERCEPT-t'], '(3)'] = res['e(b)'] / np.sqrt(np.diag(res['e(V)']))

fmOut_df.loc['Adj-R2', '(3)'] = res['e(r2_a)']
fmOut_df.loc['N', '(3)'] = res['e(N)']


# HOUSE CLEAN #
stata.run("clear")


# OUTPUT RESULTS #
fmOut_df.loc['STYLE DUMMIES',:] = ['No','No','Yes']
fmOut_df = fmOut_df.loc[['INTERCEPT','INTERCEPT-t','LAGE','LAGE-t',
                         'LSIZE','LSIZE-t','MFEE','MFEE-t','IFEE','IFEE-t',
                         'HWM','HWM-t','LCKP','LCKP-t','NTC','NTC-t',
                         'STYLE DUMMIES','Adj-R2','N'],:].copy()
fmOut_df.to_excel(tabDir + '\\Table14.xlsx')


# HOUSE CLEAN #
del dataDir, factor_df, figDir, fmOut_df, hfAlphaChar_df, hfAlpha_df, hfChar
del hfExRet_df, res, stataConfig, stataEdition, tabDir

